library(Matrix)
library(cluster)
library(lattice)
library(plotrix)
library(gridExtra)
library(KernSmooth)
library(boot)
library(parallel)
library(texreg)
library(nnet)
library(xtable)
library(lpSolve)
library(knitr)
# Snapshot of the default graphics parameters so plotting code can restore them.
.default_par <- par(no.readonly=TRUE)
# Project directory layout (repository rooted in the user's home directory).
BASEDIR <- sprintf('%s/github/post-mortem-memory', Sys.getenv('HOME'))
DATADIR <- sprintf('%s/data/', BASEDIR)
PLOTDIR <- sprintf('%s/plots/', BASEDIR)
TABLEDIR <- sprintf('%s/tables/', BASEDIR)
# Running counters for figure/table numbering; bumped via figcap()/tabcap().
fignum <- 0
tabnum <- 0
# Build the Markdown caption prefix for the next figure, incrementing the
# global counter when `increment` is TRUE.
# NOTE(review): relies on a global SUPP_INFO flag that must be defined by the
# calling document -- confirm it is set before these are invoked.
figcap <- function(increment) {
if (increment) fignum <<- fignum + 1
sprintf('**%sFigure %s:** ', if (SUPP_INFO) 'Supplementary ' else '', fignum)
}
# Same as figcap(), but for table captions.
tabcap <- function(increment) {
if (increment) tabnum <<- tabnum + 1
sprintf('**%sTable %s:** ', if (SUPP_INFO) 'Supplementary ' else '', tabnum)
}
# LaTeX snippet producing vertical whitespace between Markdown elements.
vertskip <- '$$\\\\[4mm]$$'
# We aggregate chunk_size days; a value of 7 means we aggregate by weeks.
CHUNK_SIZE <- 1
# The number of time units ('chunks') before and after death.
N <- 360
# Per-medium plot colors at full, light, and extra-light opacity.
COL <- list(NEWS=rgb(.9,.6,0), TWITTER=rgb(0,.45,.7))
COL_LIGHT <- list(NEWS=rgb(.9,.6,0,.3), TWITTER=rgb(0,.45,.7,.3))
COL_XLIGHT <- list(NEWS=rgb(.9,.6,0,.03), TWITTER=rgb(0,.45,.7,.03))
# Named accent colors.
COL_LIGHTBLUE <- "#56B4E9"
COL_YELLOW <- "#F0E442"
COL_DARKBLUE <- "#0072B2"
COL_RED <- "#D55E00"
COL_MAGENTA <- "#CC79A7"
COL_GRAY <- "#999999"
COL_ORANGE <- "#E69F00"
COL_GREEN <- "#009E73"
# Nationalities we treat as anglophone (matched against the nationality field).
ANGLO_COUNTRY_REGEX <- 'United States|United Kingdom|Canada|England|Australia|Scotland|Wales|Ireland|New Zealand|South Africa'
# The level-1 notable-type groups of the manual taxonomy (see tax_types below).
type_groups <- c('art', 'sports', 'leadership', 'known for death', 'general fame', 'academia/engineering')
# Split the (single) string `str` on the regex `sep` and return the pieces.
split_at <- function(str, sep) {
  pieces <- strsplit(str, sep)
  pieces[[1]]
}
# Converters between full RDF/URL identifiers and their short forms.
fancy_mid <- function(mid) {
  # '<http://rdf.freebase.com/ns/m.XYZ>' -> '/m/XYZ'
  sub('<http://rdf.freebase.com/ns/m.(.*)>', '/m/\\1', mid)
}
long_mid <- function(mid) {
  # '/m/XYZ' -> '<http://rdf.freebase.com/ns/m.XYZ>'
  sub('/m/(.*)', '<http://rdf.freebase.com/ns/m.\\1>', mid)
}
fancy_wiki <- function(wiki) {
  # '<http://en.wikipedia.org/wiki/Foo>' -> 'Foo'
  sub('<http://en.wikipedia.org/wiki/(.*)>', '\\1', wiki)
}
long_wiki <- function(wiki) {
  # 'Foo' -> '<http://en.wikipedia.org/wiki/Foo>'
  sprintf('<http://en.wikipedia.org/wiki/%s>', wiki)
}
# Human-readable labels for the per-person curve characteristics, keyed by the
# corresponding column names of the stats table (see load_stats()).
FANCY_CURVE_CHAR_NAMES <- c('Pre-mortem mean', 'Short-term boost', 'Long-term boost', 'Halving time')
names(FANCY_CURVE_CHAR_NAMES) <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
# A list of all days.
MIN_DATE <- as.Date("2008-08-01")
MAX_DATE <- as.Date("2014-09-30")
DAYS <- as.character(seq(MIN_DATE, MAX_DATE, by="+1 day"))
# The largest possible absolute offset (in days) of any date from a death date.
MAX_ABS_RELDATE <- as.numeric(MAX_DATE - MIN_DATE)
# Our Twitter data set is reasonably large only from 2009-06-11 on.
TWITTER_START_DATE <- as.Date("2009-06-11")
# We build a matrix x that has a row per person, and a column per day since death. There are
# nreldates = 2*(MAX_DATE-MIN_DATE)+1 such possible dates: the extreme cases are someone dying on
# MIN_DATE or MAX_DATE, so the relDates are [0 : MAX_DATE-MIN_DATE] in the former case, and
# [MIN_DATE-MAX_DATE : 0] in the latter case, for a total range of [MIN_DATE-MAX_DATE :
# MAX_DATE-MIN_DATE]; there are 2*(MAX_DATE-MIN_DATE)+1 values in this range.
# Values of relDate can be negative, zero, or positive. However, since R indices must be
# positive, we must translate relDates to get indices; the translation is as such:
# idx = relDate + MAX_DATE - MIN_DATE + 1
NRELDATES <- 2 * as.numeric(MAX_DATE - MIN_DATE) + 1
# The days on which our Spinn3r client must have been broken (there are still several tens of
# thousands of articles with dates from these days, but they were crawled on days outside of this
# window).
# These days are set to NA (rather than 0) wherever per-day counts are built.
EMPTY_DAYS <- c(
"2009-05-19", "2009-07-15", "2010-04-05", "2010-04-20", "2010-04-21", "2010-04-22", "2010-04-23", "2010-04-24",
"2010-04-25", "2010-04-26", "2010-04-27", "2010-04-28", "2010-04-29", "2010-04-30", "2010-05-01", "2010-05-02",
"2010-05-03", "2010-05-04", "2010-05-05", "2010-05-06", "2010-05-07", "2010-05-08", "2010-05-09", "2010-05-10",
"2010-05-11", "2010-05-12", "2010-05-13", "2010-05-16", "2010-05-17", "2010-05-18", "2010-05-19", "2010-05-20",
"2010-05-21", "2010-05-22", "2010-05-23", "2010-05-24", "2010-05-25", "2010-05-26", "2010-05-27", "2010-05-28",
"2010-05-29", "2010-05-30", "2010-05-31", "2010-06-01", "2010-06-02", "2010-06-03", "2010-06-04", "2010-06-05",
"2010-06-06", "2010-06-07", "2010-06-08", "2010-06-09", "2010-06-10", "2010-06-11", "2010-06-12", "2010-06-13",
"2010-06-14", "2010-06-15", "2010-06-16", "2010-06-17", "2010-06-18", "2010-06-19", "2010-06-20", "2010-06-21",
"2010-06-22", "2010-06-23", "2010-06-24", "2010-06-25", "2010-06-26", "2010-06-27", "2010-06-28", "2010-06-29",
"2010-06-30", "2010-07-01", "2010-07-02", "2010-07-03", "2010-07-04", "2010-07-05", "2010-07-06", "2010-07-07",
"2010-07-08", "2010-07-09", "2010-07-10", "2010-07-11", "2010-07-12", "2010-07-13", "2011-12-03")
# The dates of death.
# Parse the Freebase date-of-death triples (mid, relation, date, period),
# keeping only the mid and date columns.
death_dates_tbl <- read.table(pipe(sprintf('gunzip -c %s/date_of_death.nt', DATADIR)),
comment.char='', sep='\t', quote='',
col.names=c('mid', '_rel', 'date', '_period'))[,c('mid', 'date')]
# Keep only rows whose date literal is typed with a '#date>' suffix.
death_dates_tbl <- death_dates_tbl[grepl('#date>$', death_dates_tbl$date),]
# Extract the 'YYYY-MM-DD' part of the quoted literal (chars 2..11).
death_dates_tbl$date <- substring(death_dates_tbl$date, 2, 11)
# death_dates: named vector mapping mid -> date-of-death string.
death_dates <- death_dates_tbl$date
names(death_dates) <- death_dates_tbl$mid
# Our Twitter data set is reasonably large only from 2009-06-11 on.
# Get the mids of the people that died between then and MAX_DATE (Sept. 2014).
died_in_window <- names(death_dates[as.Date(death_dates) >= TWITTER_START_DATE
& as.Date(death_dates) <= MAX_DATE])
# A mapping from mids to Wikipedia names.
wiki_tbl <- read.table(pipe(sprintf('gunzip -c %s/names.DEAD.UNAMBIGUOUS.tsv.gz', DATADIR)),
comment.char='', sep='\t', quote='',
col.names=c('mid', 'names', 'aliases', 'curid', 'wiki'),
stringsAsFactors=FALSE)
# If several Wikipedia URLs are listed ('|'-separated), keep only the first.
wiki_tbl$wiki <- sapply(wiki_tbl$wiki, function(x) split_at(x, '\\|')[1])
mid_to_wiki <- wiki_tbl$wiki
names(mid_to_wiki) <- wiki_tbl$mid
mid_to_wiki <- mid_to_wiki[!is.na(mid_to_wiki)]
# The inverse mapping, from Wikipedia names back to mids.
wiki_to_mid <- names(mid_to_wiki)
names(wiki_to_mid) <- mid_to_wiki
# Properties of dead people.
props <- read.table(pipe(sprintf('gunzip -c %s/dead_people_properties.tsv.gz', DATADIR)),
comment.char='', sep='\t', quote='', header=TRUE, fill=TRUE,
col.names=c('person', 'cause_of_death', 'date_of_death', 'place_of_death', 'date_of_burial',
'place_of_burial', 'date_of_cremation', 'place_of_cremation', 'date_of_birth',
'place_of_birth', 'nationality', 'profession', 'religion', 'ethnicity',
'notable_types', 'gender'))
# Strip the plain-text label that follows the angle-bracketed identifier.
props$mid <- sub('(<.*>).*', '\\1', props$person)
props$gender <- as.factor(sub('<.*>(.*)', '\\1', props$gender))
# Age at death = death year minus birth year (year = first 4 chars of the
# quoted date literal); entries without a 4-char year become NA.
death_years <- sub('"(....).*', '\\1', props$date_of_death)
birth_years <- sub('"(....).*', '\\1', props$date_of_birth)
death_years[!grepl("....", death_years)] <- NA
birth_years[!grepl("....", birth_years)] <- NA
props$age <- as.numeric(death_years) - as.numeric(birth_years)
# Index props by mid (this requires mids to be unique).
rownames(props) <- props$mid
# Load the taxonomies.
# Manually curated taxonomy of causes of death (with levelK grouping columns).
tax_causes <- read.table(sprintf('%s/taxonomy_causes_of_death.tsv', DATADIR),
header=TRUE, sep='\t', comment.char='', quote='')
# Row keys are '<mid>cause' strings, matching the format used in props.
rownames(tax_causes) <- paste(tax_causes$mid, tax_causes$cause_of_death, sep='')
# Manually curated taxonomy of notable types.
tax_types <- read.table(sprintf('%s/taxonomy_notable_types.tsv', DATADIR),
header=TRUE, sep='\t', comment.char='', quote='')
rownames(tax_types) <- paste(tax_types$mid, tax_types$notable_type, sep='')
# Some causes of death are missing from Janice's table. Add them manually.
natural_manual <- "Viral pneumonia|Smallpox|Dementia with Lewy bodies|Heart valve disease|Creutzfeldt–Jakob disease|T-Cell Lymphoma|Adrenocortical carcinoma|Huntington's disease|Congenital heart defect|Squamous-cell carcinoma|Atypical teratoid rhabdoid tumor|Alveolar rhabdomyosarcoma|Appendix cancer|Pyelonephritis|Polymyalgia rheumatica|Polycythemia|Leiomyosarcoma|Astrocytoma"
unnatural_manual <- "Smoke inhalation|Racing Accident|Lightning|Casualty of war|Cocaine overdose|Poisoning|Shootout|Murder–suicide|Accidental drug overdose|Blast injury"
# Create the regexes for (un)natural deaths. A cause matches if it appears
# directly after '>' and is followed by end-of-string or a '|' separator.
natural_regex <- sprintf(">(%s|%s)($|\\|)",
paste(tax_causes$cause.of.death[tax_causes$level1=='natural'],
collapse='|'), natural_manual)
unnatural_regex <- sprintf(">(%s|%s)($|\\|)",
paste(tax_causes$cause.of.death[tax_causes$level1=='unnatural'],
collapse='|'), unnatural_manual)
get_cause_of_death_map <- function() {
natural_death_mids <- props$mid[which(grepl(natural_regex, props$cause_of_death))]
unnatural_death_mids <- props$mid[which(grepl(unnatural_regex, props$cause_of_death))]
map <- (props$mid %in% natural_death_mids) - (props$mid %in% unnatural_death_mids)
# If a person has a natural and an unnatural cause, we want to list them under unnatural death.
map[props$mid %in% natural_death_mids & props$mid %in% unnatural_death_mids] <- -1
names(map) <- props$mid
return(map)
}
# Drop any trailing plain-text label after the angle-bracketed identifier,
# e.g. '<http://rdf.freebase.com/ns/m.025698c>Baseball Player'
#   -> '<http://rdf.freebase.com/ns/m.025698c>'.
# Inputs without angle brackets are returned unchanged.
strip_plaintext_name_from_mid <- function(mid_with_name) {
  stripped <- sub('(<.*>).*', '\\1', mid_with_name)
  return(stripped)
}
# tax must be either tax_types or tax_causes;
# data must be a vector with taxonomy entries as values and mids as names.
# Returns the names (mids) of the `data` entries whose taxonomy row carries
# the label `value` in the 'level<level>' column of `tax`.
select_from_tax <- function(level, value, tax, data) {
  level_col <- sprintf('level%d', level)
  matching_rows <- rownames(tax)[tax[[level_col]] == value]
  names(data)[data %in% matching_rows]
}
# Return a named vector ('YYYY-MM-DD' -> count) of the total number of
# documents per day for the given medium ('NEWS' or 'TWITTER'), restricted to
# [MIN_DATE, MAX_DATE]. Days absent from the input file become 0; known
# crawler-outage days (EMPTY_DAYS) become NA.
get_num_art <- function(medium) {
if (medium != 'NEWS' && medium != 'TWITTER') {
stop('Medium must be \'NEWS\' or \'TWITTER\'')
}
num_art_tbl <- read.table(sprintf('%s/num_articles_per_day_%s.tsv', DATADIR, medium),
col.names=c('date', 'num'))
num_art <- num_art_tbl$num
names(num_art) <- num_art_tbl$date
# Restrict to the study window.
num_art <- num_art[names(num_art) >= as.character(MIN_DATE) & names(num_art) <= as.character(MAX_DATE)]
# Days missing from the file had no articles.
num_art[setdiff(DAYS, names(num_art))] <- 0
num_art <- num_art[order(names(num_art))]
# Mark crawler-outage days as missing rather than zero.
num_art[EMPTY_DAYS] <- NA
return(num_art)
}
# Load (and cache as RData) the per-person, per-day mention counts for the
# given medium. Returns a data frame with columns mid, date, num_per_doc
# (mentions per doc bucket), num_doc (number of docs), and rel_date (days
# since the person's death; negative = pre-mortem).
get_mention_freq_table <- function(medium) {
if (medium != 'NEWS' && medium != 'TWITTER') {
stop('Medium must be \'NEWS\' or \'TWITTER\'')
}
file <- sprintf('%s/RData/dead_people_mentions_%s.RData', DATADIR, medium)
if (!file.exists(file)) {
data <- read.table(pipe(sprintf('gunzip -c %s/num_dead_mentions_per_day_%s.tsv.gz', DATADIR, medium)),
comment.char='', sep='\t', quote='',
col.names=c('mid', 'date', 'num_per_doc', 'num_doc'),
colClasses=c('character', 'character', 'numeric', 'numeric'))
# Add a column having the number of days since death.
data$rel_date <- as.numeric(as.Date(data$date) - as.Date(death_dates[data$mid]))
save(data, file=file)
} else {
# Cached copy exists: `load` restores the `data` object saved above.
load(file)
}
return(data)
}
# Build (and cache as RData) the matrix of per-person mention frequencies
# aligned by day relative to death: rows = mids, columns = chunked rel-dates.
# Each value is (docs mentioning the person) / (total docs) within a chunk;
# NA marks days outside the person's observable window or crawler outages.
get_rel_date_matrix <- function(medium, data, num_art, chunk_size) {
if (medium == 'NEWS') {
# News articles must mention a person at least twice to count as "about" them.
min_num_per_doc <- 2
} else if (medium == 'TWITTER') {
min_num_per_doc <- 1
} else {
stop('Medium must be \'NEWS\' or \'TWITTER\'')
}
# Map each of the NRELDATES columns to its chunk id (centered at death).
chunks <- floor(((1:NRELDATES)-(NRELDATES+1)/2)/chunk_size)
sum.na.rm <- function(x) sum(x, na.rm=TRUE)
idx <- which(data$num_per_doc >= min_num_per_doc)
file <- sprintf('%s/RData/num_mentions_per_rel_date_%s_min_num_per_doc=%s_chunk_size=%s.RData',
DATADIR, medium, min_num_per_doc, chunk_size)
if (!file.exists(file)) {
# One row per person; built in parallel over the per-mid splits.
# NOTE(review): mc.cores=6 is hard-coded for the original machine.
x <- do.call(rbind, mclapply(split(data[idx,], data$mid[idx]), function(l) {
dod <- as.Date(death_dates[l$mid[1]])
# Aggregate the counts of docs containing 1, 2, >=3 mentions.
l <- data.frame(t(simplify2array(by(l, l$date, function(X) c(
as.numeric(as.character(X$rel_date[1])),
as.numeric(as.character(sum(X$num_doc))))))))
colnames(l) <- c('rel_date', 'num_doc')
l$date <- rownames(l)
# Initialize y, the count vector for the current mid.
# y has 2*(MAX_DATE-MIN_DATE)+1 entries, of which only MAX_DATE-MIN_DATE+1 are well-defined (as per
# our discussion of NRELDATES in load_common_data_and_functions.r).
# For the MAX_DATE-MIN_DATE days for which we have no data, the values are set to NA.
# We also set to NA the values for the days that have no Spinn3r data (primarily the 2010 hole).
# For the remaining MAX_DATE-MIN_DATE+1 days, we initialize values to 0.
offset <- as.numeric(MAX_DATE-MIN_DATE) + 1
y <- rep(NA, NRELDATES)
y[as.numeric(as.Date(DAYS)-dod) + offset] <- 0
y[l$rel_date + offset] <- l$num_doc
y[as.numeric(as.Date(EMPTY_DAYS)-dod) + offset] <- NA
# n: the number of docs per day aligned in terms of relative dates.
n <- rep(0, NRELDATES)
n[as.numeric(as.Date(DAYS)-dod) + offset] <- num_art
# Sum within chunks.
s <- tapply(y, chunks, sum.na.rm) / tapply(n, chunks, sum.na.rm)
# Set to NA the days before TWITTER_START_DATE in the case of Twitter.
if (medium == 'TWITTER') {
s[as.numeric(as.Date(DAYS[as.Date(DAYS) < TWITTER_START_DATE])-dod) + offset] <- NA
}
return(s)
}, mc.cores=6))
x <- as.matrix(x)
save(x, file=file)
} else {
# Cached copy exists: `load` restores the `x` matrix saved above.
load(file)
}
return(x)
}
# Select the people (rows of the rel-date matrix `x`) whose time series are
# complete enough to be included in the study, returning their mids.
# `medium` is unused here but kept for interface consistency with the other
# per-medium functions. Relies on the globals N, died_in_window, mid_to_wiki.
# (The original also computed num_finite_before over the whole pre-mortem
# window, but never used it; that dead computation has been removed.)
filter_people <- function(medium, x) {
  N_immed_after <- 100
  num_finite_immed_after <- rowSums(is.finite(x[,colnames(x) %in% 0:(N_immed_after-1)]))
  num_active_before <- rowSums(x[,colnames(x) %in% -N:-1] > 0, na.rm=TRUE)
  mids <- names(which(
    # Keep only people whose boundaries aren't missing.
    is.finite(x[,colnames(x)==N]) & is.finite(x[,colnames(x)==-N])
    # Keep only people that have no missing data in the N_immed_after days immediately after death.
    & num_finite_immed_after == N_immed_after
    # Keep only people that were mentioned on at least 5 days before they died.
    & num_active_before >= 5
    # Keep only people that died after the point from which on we have a lot of Twitter data.
    & rownames(x) %in% died_in_window
    # Discard people with parentheses in their names because those names, although unique in
    # Wikipedia, are unlikely to ever be used in real prose.
    & !grepl('\\(', mid_to_wiki[rownames(x)])))
  return(mids)
}
# Smooth a named numeric vector (names = days relative to death) with
# Friedman's super smoother (stats::supsmu), fitting the pre-mortem and
# post-mortem halves separately so the jump at death is not smoothed over.
# Returns the smoothed values, named by their day.
supersmooth <- function(y) {
  day <- as.numeric(names(y))
  pre <- day < 0
  suppressWarnings({
    fit_pre <- supsmu(day[pre], y[pre])
    fit_post <- supsmu(day[!pre], y[!pre])
  })
  out <- c(fit_pre$y, fit_post$y)
  names(out) <- c(fit_pre$x, fit_post$x)
  out
}
# Produce (and cache as RData) the normalized, smoothed time-series matrix X
# used as clustering input: rows = mids, columns = rel-dates -N..N. Each row
# is log10-transformed, super-smoothed, optionally centered on its pre-mortem
# mean, and linearly interpolated over missing days.
normalize_and_smooth <- function(medium, x, num_art, mean_center=TRUE) {
if (medium != 'NEWS' && medium != 'TWITTER') {
stop('Medium must be \'NEWS\' or \'TWITTER\'')
}
# Smoothing term: we add 1 mention (as computed on the highest-volume day) to each day.
eps <- 1 / max(num_art, na.rm=TRUE)
file <- sprintf('%s/RData/clustering_input_%s%s.RData', DATADIR, medium, if (mean_center) '_MEANCENTERED' else '')
if (!file.exists(file)) {
# Select the subset of reldates; keep a month of padding, so smoothing is more robust at the
# boundaries,
X <- x[,colnames(x) %in% -(N+30):(N+30)]
reldates <- as.numeric(colnames(X))
# Add-eps smoothing.
X <- X + eps
# Normalize and smooth all time series.
X <- t(apply(X, 1, function(y_unnorm) {
# Log10-transform.
y <- log10(y_unnorm)
# Smooth.
y <- supersmooth(y)
# Subtract the pre-mortem mean. Now we have log10(y_unnorm/mean*(y_unnorm)), where mean* is the
# exponential of the mean in log space (i.e., a more robust version of the mean).
# In the mean computation we exclude the 30 days immediately before death, to mitigate the
# impact of the final sick days for people whose death was foreseeable.
if (mean_center) y <- y - mean(y[reldates %in% -N:-30], na.rm=TRUE)
# Select the subset of reldates.
y <- y[names(y) %in% -N:N]
# Interpolate missing values; rule=2 means that at the boundaries the values at the boundaries are
# imputed for missing values. (But since we require our time series not to end in a missing value,
# this shouldn't happen in the post-mortem period.)
y <- approx(names(y), y, -N:N, rule=2)$y
return(y)
}))
colnames(X) <- -N:N
save(X, file=file)
} else {
# Cached copy exists: `load` restores the `X` matrix saved above.
load(file)
}
return(X)
}
# Normalize the post-mortem part of the time series to [0,1] and return the
# fraction of the window elapsed until half of the total (normalized) volume
# has been observed. Mapping the minimum to 0 is sensitive to low outliers,
# so values below the `thresh_quantile` quantile may first be clipped up to it.
# `y` is a named vector whose names are days relative to death.
time_till_half <- function(y, thresh_quantile=0) {
  # Start with the day after death, since the news might not be reporting yet on the day of.
  post <- y[as.numeric(names(y)) >= 1]
  floor_val <- quantile(post, thresh_quantile)
  clipped <- pmax(post, floor_val)
  # Min-max normalization to [0,1].
  scaled <- (clipped - min(clipped)) / (max(clipped) - min(clipped))
  half_idx <- min(which(cumsum(scaled) >= 0.5 * sum(scaled)))
  half_idx / length(scaled)
}
# Compute (and cache as TSV) one row of summary statistics per person:
# activity levels, pre-mortem mean/max, post-mortem peak and means, halving
# times, manner of death, and biographical attributes. `x` is the raw
# rel-date matrix, `X` the smoothed/normalized matrix from
# normalize_and_smooth. (`num_art` is currently unused here.)
# NOTE(review): assigning the mixed c(...) vector into stats[mid,] coerces all
# columns to character on the compute path; the cached read.table path parses
# types normally -- confirm downstream numeric use relies on the cached path.
compute_curve_stats <- function(medium, x, X, num_art) {
if (medium != 'NEWS' && medium != 'TWITTER') {
stop('Medium must be \'NEWS\' or \'TWITTER\'')
}
file <- sprintf('%s/mention_freq_for_spreadsheet_%s.csv', DATADIR, medium)
if (!file.exists(file)) {
# Find the people that died (un)natural deaths.
mids <- rownames(x)
mid_nat <- intersect(mids, names(which(get_cause_of_death_map()==1)))
mid_unnat <- intersect(mids, names(which(get_cause_of_death_map()==-1)))
# The number of days we consider before and after death.
subrange_before <- -N:-30
subrange_peak <- 0:29
subrange_after <- 30:N
x <- x[,colnames(x) %in% -N:N]
num_active <- rowSums(x > 0, na.rm=TRUE)
num_active_before <- rowSums(x[,colnames(x) %in% -N:-1] > 0, na.rm=TRUE)
num_active_after <- rowSums(x[,colnames(x) %in% 0:N] > 0, na.rm=TRUE)
num_finite_before <- rowSums(is.finite(x[,colnames(x) %in% -N:-1]))
num_finite_after <- rowSums(is.finite(x[,colnames(x) %in% 0:N]))
# Empty data frame with the target columns; rows are appended per mid below.
stats <- data.frame(mid=NA,
name=NA,
image=NA,
num_active_days=NA,
active_fraction_before=NA,
active_fraction_after=NA,
max_smoothed_before=NA,
max_raw_before=NA,
mean_before=NA,
mean_after=NA,
mean_after_peak=NA,
peak_smoothed=NA,
peak_raw=NA,
peak_raw_day=NA,
time_till_half=NA,
time_till_half_thresh=NA,
natural_death=NA,
unnatural_death=NA,
gender=NA,
cause_of_death=NA,
age=NA,
notable_type=NA)[-1,]
for (i in 1:nrow(X)) {
# Progress indicator.
if (i %% 100 == 0) print(i)
mid <- rownames(X)[i]
name <- fancy_wiki(mid_to_wiki[mid])
# Sanitize percent-encoded characters for use in an image file name.
img_name <- if (!is.na(name)) gsub('(%(25)?..)+', '+', name) else sprintf('NA_%s', gsub('.*/m\\.(.*)>', '\\1', mid))
raw <- log10(x[mid,])
smoothed <- X[mid,]
mean_before <- mean(smoothed[names(smoothed) %in% subrange_before], na.rm=TRUE)
peak_smoothed <- max(smoothed[names(smoothed) %in% subrange_peak])
peak_raw <- max(raw[names(raw) %in% subrange_peak])
# -1 such that 0 represents day of death.
peak_raw_day <- which.max(raw[names(raw) %in% subrange_peak]) - 1
stats[mid,] <- c(
fancy_mid(mid),
name,
sprintf('=image("http://infolab.stanford.edu/~west1/death/mention_freq_curves/%s/%s.png")',
medium, img_name),
num_active[mid],
num_active_before[mid]/num_finite_before[mid],
num_active_after[mid]/num_finite_after[mid],
max(smoothed[names(smoothed) %in% subrange_before], na.rm=TRUE),
max(raw[names(raw) %in% subrange_before], na.rm=TRUE),
mean_before,
mean(smoothed[names(smoothed) %in% c(subrange_peak, subrange_after)]),
mean(smoothed[names(smoothed) %in% subrange_after]),
peak_smoothed,
peak_raw,
peak_raw_day,
time_till_half(smoothed),
time_till_half(smoothed, 0.25),
if (mid %in% mid_nat) 1 else 0,
if (mid %in% mid_unnat) 1 else 0,
as.character(props[mid, 'gender']),
as.character(props[mid, 'cause_of_death']),
props[mid, 'age'],
as.character(props[mid, 'notable_types'])
)
}
# Write the data into a CSV file.
write.table(stats, file, quote=FALSE, sep='\t', row.names=FALSE)
} else {
stats <- read.table(file, header=TRUE, sep='\t', comment.char='', quote='')
}
return(stats)
}
# Load the per-person stats table (via compute_curve_stats) and enrich it with
# derived quantities: boost measures, rank transforms, death-type category,
# age group, anglophone flag, and notable-type group. Rows are indexed by the
# long-form mid.
load_stats <- function(medium, x, X, num_art) {
if (medium != 'NEWS' && medium != 'TWITTER') {
stop('Medium must be \'NEWS\' or \'TWITTER\'')
}
stats <- compute_curve_stats(medium, x, X, num_art)
rownames(stats) <- long_mid(stats$mid)
# If the person has 0 mentions on the day of death, the log will be -Inf.
# Replace those by the smallest finite number.
stats$peak_raw[!is.finite(stats$peak_raw)] <- min(stats$peak_raw[is.finite(stats$peak_raw)])
# Since the quantities are logarithmic (log10), taking the difference is equivalent to the log10
# of the ratio.
stats$peak_mean_boost <- stats$peak_raw - stats$mean_before
stats$peak_max_boost <- stats$peak_raw - stats$max_raw_before
stats$perm_boost <- stats$mean_after_peak - stats$mean_before
stats$perm_boost_all <- stats$mean_after - stats$mean_before
# Some rank transforms.
# r(x)[i] is the rank of x[i] in ascending order; ties are broken by
# first occurrence (order()), like rank(x, ties.method='first').
r <- function(x) {
n <- length(x)
# Keys: indices in x; values: ranks.
rank_map <- 1:n
names(rank_map) <- order(x)
rank_map[as.character(1:n)]
}
stats$mean_before_rank <- r(stats$mean_before)
stats$mean_after_rank <- r(stats$mean_after)
stats$peak_mean_boost_rank <- r(stats$peak_mean_boost)
stats$perm_boost_rank <- r(stats$perm_boost)
stats$perm_boost_all_rank <- r(stats$perm_boost_all)
stats$time_till_half_rank <- r(stats$time_till_half)
# Add a categorical death-type column.
stats$death_type <- 'unknown'
stats$death_type[stats$natural_death==1] <- 'natural'
stats$death_type[stats$unnatural_death==1] <- 'unnatural'
stats$death_type <- as.factor(stats$death_type)
# Add age groups.
stats$age_group <- floor(stats$age/10)*10
# Add anglo flag.
country_known <- props$mid[props$nationality!='']
anglos <- props$mid[grepl(ANGLO_COUNTRY_REGEX, props$nationality)]
stats$anglo <- 'unknown'
stats$anglo[rownames(stats) %in% country_known] <- 'non_anglo'
stats$anglo[rownames(stats) %in% anglos] <- 'anglo'
stats$anglo <- as.factor(stats$anglo)
# Notable types for all people.
types <- tax_types[strip_plaintext_name_from_mid(props$notable_types), 'level1']
names(types) <- props$mid
stats$type_group <- types[rownames(stats)]
return(stats)
}
# The stats of all people, rather than just the couple of thousands included in our study.
# Returns biographical columns from props for everyone who died within the
# study window, plus the categorical columns (death_type, anglo, type_group)
# mirroring those that load_stats() adds.
load_stats_all <- function() {
  stats_all <- props[props$mid %in% died_in_window, c('gender', 'age', 'cause_of_death', 'nationality', 'notable_types')]
  # Compute the cause-of-death classification once; the original called
  # get_cause_of_death_map() twice, repeating the expensive regex matching
  # over all people.
  cod_map <- get_cause_of_death_map()
  stats_all$death_type <- 'unknown'
  stats_all$death_type[rownames(stats_all) %in% names(which(cod_map==1))] <- 'natural'
  stats_all$death_type[rownames(stats_all) %in% names(which(cod_map==-1))] <- 'unnatural'
  stats_all$death_type <- as.factor(stats_all$death_type)
  # Anglophone flag, based on the nationality field.
  country_known <- rownames(stats_all)[stats_all$nationality!='']
  anglos <- rownames(stats_all)[grepl(ANGLO_COUNTRY_REGEX, stats_all$nationality)]
  stats_all$anglo <- 'unknown'
  stats_all$anglo[rownames(stats_all) %in% country_known] <- 'non_anglo'
  stats_all$anglo[rownames(stats_all) %in% anglos] <- 'anglo'
  stats_all$anglo <- as.factor(stats_all$anglo)
  # Level-1 notable-type group from the manual taxonomy.
  types <- tax_types[strip_plaintext_name_from_mid(stats_all$notable_types), 'level1']
  names(types) <- rownames(stats_all)
  stats_all$type_group <- types[rownames(stats_all)]
  return(stats_all)
}
# Build the biographical summary table comparing three populations: all people
# who died in the window (stats_all), those included in the study (stats_N),
# and those entering the regression (lmdata_N). Each section reports the N/A
# fraction and the distribution over categories as percentages.
# NOTE(review): relies on the globals stats_N, stats_all, and lmdata_N being
# defined by the calling document -- confirm before use.
make_bio_table <- function() {
to_percentage <- function(x) sprintf('%.0f%%', x*100)
# Named character vectors, filled section by section below.
summary_table_all <- NULL
summary_table_filtered <- NULL
summary_table_lm <- NULL
### Age
summary_table_filtered['Age'] <- ''
summary_table_all['Age'] <- ''
summary_table_lm['Age'] <- ''
summary_table_filtered['Age N/A'] <- to_percentage(mean(is.na(stats_N$age)))
summary_table_filtered[c('1st quartile', 'Mean', 'Median', '3rd quartile')] <- sprintf('%.0f', summary(stats_N$age)[c(2,4,3,5)])
summary_table_all['Age N/A'] <- to_percentage(mean(is.na(stats_all$age)))
# Ages above 120 are treated as data errors and excluded from the summary.
summary_table_all[c('1st quartile', 'Mean', 'Median', '3rd quartile')] <- sprintf('%.0f', summary(stats_all$age[stats_all$age <= 120])[c(2,4,3,5)])
summary_table_lm['Age N/A'] <- to_percentage(mean(is.na(lmdata_N$age)))
summary_table_lm[c('1st quartile', 'Mean', 'Median', '3rd quartile')] <- sprintf('%.0f', summary(lmdata_N$age)[c(2,4,3,5)])
### Gender
summary_table_filtered['Gender'] <- ''
summary_table_all['Gender'] <- ''
summary_table_lm['Gender'] <- ''
idx <- stats_N$gender %in% c('Female', 'Male')
summary_table_filtered['Gender N/A'] <- to_percentage(mean(!idx))
summary_table_filtered[c('Female', 'Male')] <- to_percentage((summary(stats_N$gender[idx]) / sum(summary(stats_N$gender[idx])))[c('Female', 'Male')])
idx <- stats_all$gender %in% c('Female', 'Male')
summary_table_all['Gender N/A'] <- to_percentage(mean(!idx))
summary_table_all[c('Female', 'Male')] <- to_percentage((summary(stats_all$gender[idx]) / sum(summary(stats_all$gender[idx])))[c('Female', 'Male')])
idx <- lmdata_N$gender %in% c('Female', 'Male')
summary_table_lm['Gender N/A'] <- to_percentage(mean(!idx))
summary_table_lm[c('Female', 'Male')] <- to_percentage((summary(lmdata_N$gender[idx]) / sum(summary(lmdata_N$gender[idx])))[c('Female', 'Male')])
### Manner of death
summary_table_filtered['Manner of death'] <- ''
summary_table_all['Manner of death'] <- ''
summary_table_lm['Manner of death'] <- ''
idx <- stats_N$death_type != 'unknown'
summary_table_filtered['Manner of death N/A'] <- to_percentage(mean(!idx))
summary_table_filtered[c('Natural', 'Unnatural')] <-
to_percentage((summary(stats_N$death_type[idx]) / sum(summary(stats_N$death_type[idx])))[c('natural', 'unnatural')])
idx <- stats_all$death_type != 'unknown'
summary_table_all['Manner of death N/A'] <- to_percentage(mean(!idx))
summary_table_all[c('Natural', 'Unnatural')] <-
to_percentage((summary(stats_all$death_type[idx]) / sum(summary(stats_all$death_type[idx])))[c('natural', 'unnatural')])
idx <- lmdata_N$death_type != 'unknown'
summary_table_lm['Manner of death N/A'] <- to_percentage(mean(!idx))
summary_table_lm[c('Natural', 'Unnatural')] <-
to_percentage((summary(lmdata_N$death_type[idx]) / sum(summary(lmdata_N$death_type[idx])))[c('natural', 'unnatural')])
### Cultural background
summary_table_filtered['Cultural background'] <- ''
summary_table_all['Cultural background'] <- ''
summary_table_lm['Cultural background'] <- ''
idx <- stats_N$anglo != 'unknown'
summary_table_filtered['Cultural background N/A'] <- to_percentage(mean(!idx))
summary_table_filtered[c('Anglophone', 'Non-anglophone')] <- to_percentage((summary(stats_N$anglo[idx]) / sum(summary(stats_N$anglo[idx])))[c('anglo', 'non_anglo')])
idx <- stats_all$anglo != 'unknown'
summary_table_all['Cultural background N/A'] <- to_percentage(mean(!idx))
summary_table_all[c('Anglophone', 'Non-anglophone')] <- to_percentage((summary(stats_all$anglo[idx]) / sum(summary(stats_all$anglo[idx])))[c('anglo', 'non_anglo')])
idx <- lmdata_N$anglo != 'unknown'
summary_table_lm['Cultural background N/A'] <- to_percentage(mean(!idx))
summary_table_lm[c('Anglophone', 'Non-anglophone')] <- to_percentage((summary(lmdata_N$anglo[idx]) / sum(summary(lmdata_N$anglo[idx])))[c('anglo', 'non_anglo')])
### Notable type
summary_table_filtered['Notability type'] <- ''
summary_table_all['Notability type'] <- ''
summary_table_lm['Notability type'] <- ''
idx <- !is.na(stats_N$type_group)
summ <- summary(stats_N$type_group[idx])[type_groups]
summary_table_filtered['Notability type N/A'] <- to_percentage(mean(!idx))
summary_table_filtered[names(summ)] <- to_percentage(summ / sum(summ))
idx <- !is.na(stats_all$type_group)
summ <- summary(stats_all$type_group[idx])[type_groups]
summary_table_all['Notability type N/A'] <- to_percentage(mean(!idx))
summary_table_all[names(summ)] <- to_percentage(summ / sum(summ))
idx <- !is.na(lmdata_N$type_group)
summ <- summary(lmdata_N$type_group[idx])[type_groups]
summary_table_lm['Notability type N/A'] <- to_percentage(mean(!idx))
summary_table_lm[names(summ)] <- to_percentage(summ / sum(summ))
summary_table_filtered['Count'] <- nrow(stats_N)
summary_table_all['Count'] <- nrow(stats_all)
summary_table_lm['Count'] <- nrow(lmdata_N)
# Return a table with stats for both "all" and "filtered" (i.e., those included in the study).
data.frame(All=summary_table_all, Included=summary_table_filtered, Regression=summary_table_lm)
}
# Plot the daily document volume for `medium` on a log y-axis, restricted to
# [TWITTER_START_DATE, MAX_DATE], with gray minor ticks at month starts and
# labels at year starts.
plot_num_art <- function(num_art, medium) {
par(mar=c(6,5,2,2))
idx <- which(names(num_art)==TWITTER_START_DATE):which(names(num_art)==MAX_DATE)
plot(num_art[idx], axes=FALSE, log='y', type='p', xlab='', ylab='Number of documents',
col=COL[[medium]], main=medium, ylim=c(5e4,1e8))
axis(2)
days <- DAYS[idx]
# Unlabeled gray ticks on the first of every month.
ticks <- which(grepl('-01$', days))
axis(1, at=ticks, labels=rep('', length(ticks)), col.ticks='gray')
# Labeled ticks on every January 1st.
ticks <- which(grepl('-01-01$', days))
axis(1, at=ticks, labels=days[ticks], las=2)
# par(.default_par)
}
# Plot one person's mention-frequency curve (raw points plus smoothed line)
# for use in a small-multiples grid; `name` is the Wikipedia page name.
# Writes a PDF to PLOTDIR when the global SAVE_PLOTS is TRUE. `xlab`/`ylab`
# toggle the axis captions (so only edge panels of the grid carry them).
plot_time_series_for_small_multiples <- function(name, medium, X, x, num_art,
xlab=TRUE, ylab=TRUE) {
if (SAVE_PLOTS) pdf(sprintf('%s/mention_curve_%s_%s.pdf', PLOTDIR, name, medium), width=2, height=2,
pointsize=8, family='Helvetica', useDingbats=FALSE)
par(mar=c(3,3,1,1))
mid <- wiki_to_mid[sprintf('<http://en.wikipedia.org/wiki/%s>', name)]
# Lower y-limit: one mention on the highest-volume day, in log10 space.
ymin <- log10(1 / max(num_art, na.rm=TRUE))
raw <- log10(x[mid, colnames(x) %in% -100:360])
smoothed <- X[mid, colnames(X) %in% -100:360]
plot(-100:360, raw, col=COL_LIGHT[[medium]],
bty='n', xlab='', xlim=c(-100,360), ylim=c(ymin,-1), ylab='',
panel.first=abline(v=0, col='gray'))
# Person's name in the top-right corner (underscores shown as spaces).
text(360, -1, sprintf('%s', gsub('_', ' ', name)), adj=c(1,1), cex=1)
lines(names(smoothed), smoothed, col=COL[[medium]], lwd=2)
if (xlab) mtext('Days since death', side=1, line=2)
if (ylab) mtext('Fraction mentioning docs [log10]', side=2, line=2)
if (SAVE_PLOTS) dev.off()
}
# Histogram of the post-mortem day (0..100) on which each person's mention
# frequency peaks, with the cumulative relative frequency overlaid on a
# secondary axis. Uses the global matrices x_N/x_T and the SAVE_PLOTS flag.
# Returns list(hist=<hist object>, max_days=<per-person peak day>).
plot_max_mem_hist <- function(medium) {
if (SAVE_PLOTS) cairo_pdf(sprintf('%s/max_mem_hist_%s.pdf', PLOTDIR, medium), width=3.4, height=2.4,
pointsize=8, family='Helvetica')
par(mar=c(3,3,1,3))
if (medium == 'NEWS') {
x <- x_N
} else if (medium == 'TWITTER') {
x <- x_T
} else {
stop("medium must be NEWS or TWITTER!")
}
t_max <- 100
# Peak day per person; -1 so day 0 is the day of death.
max_days <- apply(x[,colnames(x) %in% 0:t_max], 1, function(x) which.max(x) - 1)
h <- hist(max_days, breaks=0:(t_max+1), right=FALSE, include.lowest=FALSE, xlab='', ylab='',
main=NULL, axes=FALSE, col=COL[[medium]], border=NA, xlim=c(0,t_max+1), ylim=c(0,800))
cum <- cumsum(h$counts)/sum(h$counts)
ticks <- seq(0, t_max, 10)
axis(1, at=ticks+0.5, labels=ticks, las=2)
mtext('Days since death', side=1, line=2)
axis(2, col=COL[[medium]], col.axis=COL[[medium]])
mtext('Number of people', side=2, line=2, col=COL[[medium]])
# Overlay the cumulative curve in a fresh coordinate system on [0,1].
par(new=TRUE)
plot((0:t_max)+0.5, cum, type='p', xlab='', ylab='', xlim=c(0,t_max+1), ylim=c(0,1), axes=FALSE,
panel.first=abline(h=cum[30], v=29.5, col='gray', lty=2))
axis(4)
mtext('Cumulative rel. frequency', side=4, line=2)
text(29.5, cum[30], labels=sprintf('%.1f%% had maximum mention\n frequency by day 29', 100*cum[30]),
adj=c(-0.1,1.5), col='gray')
if (SAVE_PLOTS) dev.off()
return(list(hist=h, max_days=max_days))
}
# Fit the biexponential memory model
#   S(t) = N/(p-q+r) * ((p-q)*exp(-(p+r)*t) + r*exp(-q*t))
# to the average post-mortem mention curve of the given medium, in three
# stages of increasing complexity, each seeding the next to help nls converge:
#   (1) single exponential (r = 0), fit by linear regression in log space;
#   (2) no cultural decay (q = 0); (3) the full model.
# Returns the coefficient vector (p, N, r, q) of the full model.
# Uses the global matrices x_N/x_T.
biexp_fit <- function(medium) {
if (medium == 'NEWS') {
x <- x_N
} else if (medium == 'TWITTER') {
x <- x_T
} else {
stop("medium must be NEWS or TWITTER!")
}
days <- 1:400
# Map the first post-mortem day to t = 0.
t <- days-1
# Additive smoothing with the smallest positive observed value, so the log
# of zero-mention days is finite.
eps <- min(x[x>0], na.rm=TRUE)
y <- 10^colMeans(log10(x[,colnames(x) %in% days] + eps), na.rm=TRUE)
df <- data.frame(y=y, t=t)
# First approximation: r = 0.
# log(y) ~ log(N*exp(-p*t)) = log(N)-p*t
model0 <- lm(log(y) ~ t, data=df)
N0 <- exp(coef(model0)[1])
p0 <- -coef(model0)[2]
# Second approximation: q = 0.
model1 <- nls(log(y) ~ log( N/(p+r) * (p*exp(-(p+r)*t) + r) ),
start=list(p=p0, N=N0, r=0),
data=df, control=list(maxiter=1e8), algorithm='port')
N1 <- coef(model1)['N']
p1 <- coef(model1)['p']
r1 <- coef(model1)['r']
# Full model.
model <- nls(log(y) ~ log( N/(p-q+r) * ((p-q)*exp(-(p+r)*t) + r*exp(-q*t)) ),
start=list(p=p1, N=N1, r=r1, q=0),
data=df, control=list(maxiter=1e8), algorithm='port', lower=c(q=0))
return(coef(model))
}
# Plot the average log10 fraction of mentioning documents around death for
# the given medium. If `biexp_coefs` (from biexp_fit) is supplied, overlay
# the biexponential fit split into its communicative u(t) and cultural v(t)
# components, plus a log-x inset of the post-mortem period.
# Uses the global matrices x_N/x_T and the SAVE_PLOTS flag.
plot_avg_fraction_of_mentioning_docs <- function(medium, biexp_coefs=NULL) {
if (SAVE_PLOTS) pdf(sprintf('%s/avg_mention_curve_%s.pdf', PLOTDIR, medium), width=3.4, height=3.4,
pointsize=8, family='Helvetica', useDingbats=FALSE)
par(mar=c(3,3,1,1))
if (medium == 'NEWS') {
x <- x_N
} else if (medium == 'TWITTER') {
x <- x_T
} else {
stop("medium must be NEWS or TWITTER!")
}
max_day <- 400
days <- -100:max_day
# Additive smoothing with the smallest positive observed value.
eps <- min(x[x>0], na.rm=TRUE)
# Log.
y <- colMeans(log10(x[,colnames(x) %in% days] + eps), na.rm=TRUE)
par(fig=c(0, 1, 0, 1))
plot(days, y, type='p', bty='n', lwd=2, xlab='', ylab='', #xaxt='n', yaxt='n',
panel.first=abline(v=0, col='gray', lwd=1, lty=2), col=COL[[medium]], las=0)
mtext('Days since death', side=1, line=2)
mtext('Fraction mentioning documents [log10]', side=2, line=2)
# Annotate the one-year mark.
y365 <- y['365']
arrows(x0=325, x1=355, y0=y365, y1=y365, length=0.05)
text(325, y365, '365 days', adj=c(1.1, 0.5))
if (!is.null(biexp_coefs)) {
N <- biexp_coefs['N']
p <- biexp_coefs['p']
r <- biexp_coefs['r']
q <- biexp_coefs['q']
t <- 0:(max(days)-1)
# u(t): communicative memory; v(t): cultural memory; S(t) = u(t) + v(t).
mem_comm <- N * exp(-(p+r)*t)
mem_cult <- N/(p-q+r)*r * (exp(-q*t) - exp(-(p+r)*t))
mem <- mem_comm + mem_cult
tt <- t+1
lines(tt, log10(mem_comm), lty=1, lwd=2, col=COL_GREEN)
lines(tt, log10(mem_cult), lty=1, lwd=2, col=COL_RED)
lines(tt, log10(mem), lty=1, lwd=2, col='black')
legend('topright', bty='n',
legend=c('Daily average', 'Biexponential fit S(t) = u(t) + v(t)',
'Communicative memory u(t)', 'Cultural memory v(t)'),
seg.len=3,
pch=c(1,NA,NA,NA),
lty=c(0,1,1,1), #lty=c(0,1,4,2),
col=c(COL[[medium]], 'black', COL_GREEN, COL_RED),
lwd=c(2, 2, 2, 2))
# Log-log.
# Inset with a logarithmic x-axis, post-mortem days only.
par(fig=c(0.35, 0.9, 0.27, 0.77), new=TRUE)
y <- y[names(y) %in% tt]
# Here the day of death is shown as day 1 (due to log axes).
plot(tt, y, type='p', bty='n', xlab='', ylab='', lwd=2, col=COL[[medium]], las=0, log='x')
lines(tt, log10(mem_comm), lty=1, lwd=2, col=COL_GREEN)
lines(tt, log10(mem_cult), lty=1, lwd=2, col=COL_RED)
lines(tt, log10(mem), lty=1, lwd=2, col='black')
legend('topright', bty='n', legend='Log x-scale', inset=c(0.1,0.2))
}
if (SAVE_PLOTS) dev.off()
}
# Plot the proportion of cultural memory v(t) in the total memory
# S(t) = u(t) + v(t), as a function of days since death, according to the
# biexponential model fit.
#
# Args:
#   medium: 'NEWS' or 'TWITTER'; used for the output file name and color.
#   biexp_coefs: named vector with entries 'N', 'p', 'r', 'q' (as returned
#     by biexp_fit); required.
# Side effects: draws on the active device; writes a PDF when SAVE_PLOTS.
plot_comm_vs_cult_mem <- function(medium, biexp_coefs=NULL) {
  # Validate arguments before opening any device, so an error cannot leave a
  # dangling PDF device behind. (The original also loaded x_N/x_T into a local
  # `x` that was never used; that dead code is removed.)
  if (medium != 'NEWS' && medium != 'TWITTER') stop("medium must be NEWS or TWITTER!")
  if (is.null(biexp_coefs)) stop("biexp_coefs (as returned by biexp_fit) is required")
  if (SAVE_PLOTS) pdf(sprintf('%s/comm_vs_cult_mem_%s.pdf', PLOTDIR, medium), width=3.4, height=2.4,
                      pointsize=8, family='Helvetica', useDingbats=FALSE)
  par(mar=c(4,4,1,1))
  max_day <- 40
  t <- seq(0, max_day-1, 0.01)
  N <- biexp_coefs['N']
  p <- biexp_coefs['p']
  r <- biexp_coefs['r']
  q <- biexp_coefs['q']
  # Communicative memory u(t) and cultural memory v(t) of the biexponential model.
  u <- function(t, N, p, r, q) N * exp(-(p+r)*t)
  v <- function(t, N, p, r, q) N/(p-q+r)*r * (exp(-q*t) - exp(-(p+r)*t))
  mem_comm <- u(t, N, p, r, q)
  mem_cult <- v(t, N, p, r, q)
  ratio <- mem_cult / (mem_comm + mem_cult)
  plot(t+1, ratio, type='l', bty='n', col=COL[[medium]], lwd=2,
       xlab='Days since death', ylab='Fraction of cultural memory')
  # Mark the first days on which cultural memory reaches 50% and 99%.
  for (perc in c(0.5, 0.99)) {
    idx <- min(which(ratio>=perc))
    points(t[idx]+1, ratio[idx], lwd=3, pch=20, col='black', cex=2)
    segments(-5, ratio[idx], t[idx]+1, ratio[idx], col='black', lty=2)
    segments(t[idx]+1, -5, t[idx]+1, ratio[idx], col='black', lty=2)
    text(t[idx]+1, ratio[idx], adj=c(-0.2, 1.2),
         labels=sprintf('%.0f%% after\n%.0f days', 100*perc, ceiling(t[idx]+1)))
  }
  if (SAVE_PLOTS) dev.off()
}
# Overlay all individual (smoothed) mention time series of one medium in a
# single panel, using a nearly transparent color so density is visible.
overlay_time_series <- function(X, medium) {
  rel_days <- -N:N
  series <- t(X)
  matplot(rel_days, series, type='l', lty=1, lwd=2,
          col=COL_XLIGHT[[medium]], main=medium, bty='n',
          xlab='Days since death', ylab='Fraction mentioning docs [log10]',
          ylim=range(X))
}
# Kernel density estimate on a bounded interval, with reflection at both
# boundaries to avoid the usual density drop near the edges.
#
# Args:
#   x: numeric sample.
#   bw: kernel bandwidth passed to bkde().
#   range: the interval c(lo, hi) on which the density is estimated.
# Returns: a list with components x (grid) and y (density), restricted to
#   `range` and renormalized (the reflection tripled the sample).
smoothed_density <- function(x, bw=0.05, range=c(0,1)) {
  lo <- range[1]
  hi <- range[2]
  # Mirror the sample around each boundary.
  mirrored_lo <- 2*lo - x
  mirrored_hi <- mirrored_lo + 2*(hi - lo)
  augmented <- c(mirrored_lo, x, mirrored_hi)
  est <- bkde(augmented, bandwidth=bw, range=range(augmented), gridsize=1000)
  # Keep only the original interval and renormalize (data was tripled).
  keep <- est$x >= lo & est$x <= hi
  est$x <- est$x[keep]
  est$y <- 3 * est$y[keep]
  est
}
# Smoothed histograms of curve properties for the different people groups.
# One panel per (group, characteristic) pair; within a characteristic, all
# groups share a common y-range so the densities are comparable.
#
# Args:
#   stats: per-person statistics data frame (must contain column `prop` and
#     the four characteristic columns).
#   medium: 'NEWS' or 'TWITTER'; selects the line color.
#   prop: name of the grouping column in `stats`.
#   groups: group values; one set of panels per group.
#   main: optional fixed panel title; defaults to the group value.
plot_smoothed_densities_raw <- function(stats, medium, prop, groups, main=NULL) {
  nbw <- 20  # bandwidth = 1/nbw of the data range
  chars <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
  # FANCY_CURVE_CHAR_NAMES is defined as an unnamed vector (file head), so
  # indexing it by the characteristic name would yield NA axis labels; attach
  # the names here in the documented order.
  fancy_names <- setNames(FANCY_CURVE_CHAR_NAMES, chars)
  par(mfrow=c(length(groups), length(chars)), mar=c(4,4,2,2))
  for (char in chars) {
    # First pass: find the extreme y-values across groups for this characteristic.
    ymin <- Inf
    ymax <- -Inf
    for (group in groups) {
      mask <- is.finite(stats[,prop]) & stats[,prop] %in% group
      x <- stats[mask,char]
      sm <- smoothed_density(x, bw=(max(x)-min(x))/nbw, range=range(x))$y
      ymin <- min(ymin, min(sm))
      ymax <- max(ymax, max(sm))
    }
    # Second pass: draw one density per group on the shared y-scale.
    # (A second, non-reflected bkde() estimate was computed here before but
    # never used; that dead code is removed.)
    for (group in groups) {
      mask <- is.finite(stats[,prop]) & stats[,prop] %in% group
      x <- stats[mask,char]
      est <- smoothed_density(x, bw=(max(x)-min(x))/nbw, range=range(x))
      plot(est$x, est$y, ylim=c(ymin,ymax), type='l', lwd=2,
           main=if (is.null(main)) group else main,
           col=COL[[medium]], xlab=fancy_names[char], ylab='Density', bty='n')
      rug(x)
    }
  }
  par(.default_par)
}
# Notched boxplots of the relative ranks of the four curve characteristics,
# split by the grouping vector `prop` and restricted to `groups`.
#
# Args:
#   stats: per-person statistics data frame with '<char>_rank' columns.
#   medium: 'NEWS' or 'TWITTER'; selects the box fill color.
#   prop: grouping vector (aligned with rows of `stats`).
#   groups: group values, in plotting order.
plot_chars_by_group <- function(stats, medium, prop, groups) {
  chars <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
  # FANCY_CURVE_CHAR_NAMES is defined as an unnamed vector (file head), so
  # indexing it by the characteristic name would yield NA titles; attach the
  # names here in the documented order.
  fancy_names <- setNames(FANCY_CURVE_CHAR_NAMES, chars)
  par(mfrow=c(1, length(chars)), mar=c(10,4,2,2))
  for (char in chars) {
    char_rank <- sprintf('%s_rank', char)
    # Relative rank in [0, 1], grouped by `prop`, ordered as in `groups`.
    grouped <- split(stats[,char_rank]/nrow(stats), prop)[as.character(groups)]
    boxplot(grouped, las=3, notch=TRUE, main=fancy_names[char],
            col=COL_LIGHT[[medium]], bty='n', ylab='Relative rank')
  }
  par(.default_par)
}
# 2D kernel density estimate on the unit square, with reflection at all four
# boundaries to avoid density drops near the edges.
#
# Args:
#   x: two-column matrix of points in [0,1] x [0,1].
#   bw: bandwidth (same in both directions) passed to bkde2D().
# Returns: the bkde2D result restricted to [0,1] x [0,1], with fhat
#   renormalized (reflection multiplied the sample by 9).
smoothed_2d_density <- function(x, bw) {
  a <- x[,1]
  b <- x[,2]
  # All nine mirror images (including the original) around 0 and 1.
  x <- do.call(rbind, lapply(list(-a, a, 2-a), function(u) {
    do.call(rbind, lapply(list(-b, b, 2-b), function(v) cbind(u, v)))
  }))
  est <- bkde2D(x, range.x=list(c(-1,2), c(-1,2)), bandwidth=c(bw,bw), gridsize=c(153,153))
  # Keep only the original unit square and renormalize (data was 9-folded).
  keep <- which(est$x1 >= 0 & est$x1 <= 1)
  est$x1 <- est$x1[keep]
  est$x2 <- est$x2[keep]
  est$fhat <- 9 * est$fhat[keep, keep]
  est
}
# Filled-contour plots of the smoothed 2D density of two curve
# characteristics (their rank columns), one plot per group, on a shared
# z-scale so the gray levels are comparable across groups.
#
# Args:
#   stats: per-person statistics data frame with '<char>_rank' columns.
#   medium: 'NEWS' or 'TWITTER' (used in the plot title only).
#   char1, char2: base names of the two characteristics.
#   prop: name of the grouping column in `stats`.
#   groups: group values; one plot per group.
#   main: optional fixed title; defaults to the group value.
plot_smoothed_2d_densities <- function(stats, medium, char1, char2, prop, groups, main=NULL) {
bw <- 0.1
char1_rank <- sprintf('%s_rank', char1)
char2_rank <- sprintf('%s_rank', char2)
# Find the maximum y-value to be plotted.
ymax <- -Inf
for (group in groups) {
mask <- is.finite(stats[,prop]) & stats[,prop] %in% group
# NOTE(review): ranks are divided by length(mask) == nrow(stats), i.e., they
# are relative to the whole table, not to the group -- confirm this is intended.
x <- cbind(stats[mask, char1_rank], stats[mask, char2_rank])/length(mask)
ymax <- max(ymax, max(smoothed_2d_density(x, bw=bw)$fhat))
}
for (group in groups) {
mask <- is.finite(stats[,prop]) & stats[,prop] %in% group
x <- cbind(stats[mask, char1_rank], stats[mask, char2_rank])/length(mask)
est <- smoothed_2d_density(x, bw=bw)
# Shared zlim across groups (computed in the first pass above).
filled.contour(est$x1, est$x2, est$fhat, zlim=c(0,ymax),
color=function(n) gray.colors(n,0,1),
plot.title=title(main=sprintf('%s\n(%s)', if (is.null(main)) group else main, medium),
xlab=char1, ylab=char2))
# NOTE(review): par is reset inside the loop (after every plot) -- verify
# this is intentional and not meant to run once after the loop.
par(.default_par)
}
}
# Cluster people by the given feature columns with k-means, for k = 2..30.
# Features are z-score normalized first; within each clustering, cluster
# labels are relabeled so that cluster 1 is the largest, 2 the second
# largest, and so on.
#
# Args:
#   stats: per-person statistics data frame.
#   feats: names of the feature columns to cluster on.
# Returns: list(clustering=<list of kmeans objects>, diss=<dist matrix of
#   the normalized features>, K=2:30).
cluster <- function(stats, feats) {
  input <- cbind(stats[, feats])
  colnames(input) <- feats
  # z-score normalize (center each column, then divide by its sd).
  input <- scale(input)
  ks <- 2:30
  # set.seed inside each worker makes the (parallel) runs deterministic.
  fits <- mclapply(ks, function(k) { set.seed(1); kmeans(input, k, nstart=20) })
  # Relabel clusters by decreasing size.
  for (pos in seq_along(ks)) {
    k <- ks[pos]
    km <- fits[[pos]]
    by_size <- order(km$size, decreasing=TRUE)
    relabel <- seq_len(k)
    names(relabel) <- by_size  # old label -> new rank
    km$size <- km$size[by_size]
    km$withinss <- km$withinss[by_size]
    km$centers <- km$centers[by_size, ]
    rownames(km$centers) <- seq_len(k)
    km$cluster <- relabel[as.character(km$cluster)]
    fits[[pos]] <- km
  }
  list(clustering=fits, diss=dist(input), K=ks)
}
# Plot the average silhouette width (computed on squared dissimilarities, as
# appropriate for k-means) against the number of clusters.
plot_sil_width <- function(medium, kmeans_all, diss) {
  widths <- vapply(kmeans_all, function(km) {
    summary(silhouette(km$cluster, diss^2))$avg.width
  }, numeric(1))
  ks <- seq(2, length.out=length(kmeans_all))
  plot(ks, widths, type='b', xlab='Number of clusters',
       ylab='Avg. silhouette width', main=medium, col=COL[[medium]], lwd=2, bty='n')
}
# One panel per cluster, each overlaying the (smoothed) mention time series
# of all people assigned to that cluster.
plot_clustered_time_series <- function(medium, kmeans, X) {
  n_clusters <- nrow(kmeans$centers)
  par(mfrow=c(1, n_clusters))
  for (cl in seq_len(n_clusters)) {
    members <- kmeans$cluster == cl
    matplot(-N:N, t(X[members,]), type='l', lty=1, lwd=2, col=COL_XLIGHT[[medium]],
            xlab='Days since death', ylab='% mentioning docs [log10]', ylim=range(X),
            main=sprintf('%s cluster %s', medium, cl))
  }
  par(.default_par)
}
# Basic-bootstrap 95% confidence interval around func(x).
#
# Args:
#   x: data vector.
#   func: statistic, e.g. median.
#   R: number of bootstrap replicates.
# Returns: named vector c(upper, point_est, lower); upper/lower are NA when
#   boot.ci cannot produce an interval.
boot_ci <- function(x, func, R=5000) {
  bs <- boot(x, statistic=function(d, i) func(d[i]), R=R)
  ci <- boot.ci(bs, conf=0.95, type="basic")$basic[4:5]
  lower <- if (is.null(ci)) NA else ci[1]
  upper <- if (is.null(ci)) NA else ci[2]
  c(upper=upper, point_est=func(x), lower=lower)
}
# Build the regression data set for one medium: restrict to people with
# complete covariates, relevel the categorical predictors to the chosen
# reference categories, and add centered relative-rank versions of the
# outcome variables.
#
# Args:
#   medium: 'NEWS' or 'TWITTER'; selects the global stats_N or stats_T.
# Returns: a data frame ready for lm()/multinom().
make_regression_data <- function(medium) {
  if (medium != 'NEWS' && medium != 'TWITTER') stop('Medium must be \'NEWS\' or \'TWITTER\'')
  stats <- if (medium=='NEWS') stats_N else stats_T
  data <- stats[stats$gender != ''
                & stats$death_type != 'unknown'
                & !is.na(stats$type_group) # This removes only 4 people.
                & !is.na(stats$age_group)
                & stats$age %in% 20:99,]
  data$age_group <- factor(data$age_group)
  # Reference categories for the regression.
  # Mean (median) age is 71 (74), so use the age bracket 70-79 as the reference level.
  base_age_group <- '70'
  # (A dead assignment of base_death_type <- 'unknown', immediately
  # overwritten, was removed here.)
  base_death_type <- 'natural'
  base_type_group <- 'art' # 'general fame'
  base_gender <- 'Male'
  base_anglo <- 'anglo'
  data$age_group <- relevel(data$age_group, base_age_group)
  data$death_type <- relevel(data$death_type, base_death_type)
  data$type_group <- relevel(data$type_group, base_type_group)
  data$gender <- relevel(data$gender, base_gender)
  data$anglo <- relevel(data$anglo, base_anglo)
  # Relative ranks in [-0.5, 0.5], centered at zero.
  N <- dim(data)[1]
  relranks <- (0:(N-1))/(N-1) - 0.5
  data$mean_before_relrank[order(data$mean_before_rank)] <- relranks
  data$peak_mean_boost_relrank[order(data$peak_mean_boost_rank)] <- relranks
  data$perm_boost_relrank[order(data$perm_boost_rank)] <- relranks
  return(data)
}
# The real uncertainty in the summed coefficients is larger; need to add separate SEs and entries of vcov
# matrix: https://biologyforfun.wordpress.com/2017/05/17/adding-standard-errors-for-interaction-terms/comment-page-1/
#
# Compute standard errors for compound (summed) age coefficients of `mod`:
# for each age-group term, SE(sum of involved terms) = sqrt of the sum over
# the corresponding submatrix of vcov(mod).
#
# Args:
#   mod: a fitted lm whose age coefficients are named like 'age_groupNN'.
#   interact: optional interacting term name (e.g. 'death_typeunnatural');
#     its main effect and the age:interact interaction enter the sum.
#   base_age: reference age level; its (absent) interaction term is added to
#     the vcov matrix with zero variance/covariance.
# Returns: named numeric vector of SEs, one per age_group coefficient.
compound_se_for_lm <- function(mod, interact=NULL, base_age=NULL) {
X <- as.data.frame(vcov(mod))
# Age-group coefficients end in exactly two characters (the decade digits).
idx <- rownames(X)[grep('age_group..$', rownames(X))]
# The base age has no interaction row/column in vcov; add it as all zeros.
# (When interact/base_age are NULL, sprintf yields character(0) and these
# assignments are no-ops.)
X[sprintf('age_group%s:%s', base_age, interact),] <- 0
X[,sprintf('age_group%s:%s', base_age, interact)] <- 0
sapply(idx, function(i) {
# Var(sum of terms) = sum of the covariance submatrix of those terms.
sub_idx <- c(i, if (is.null(interact)) NULL else c(interact, sprintf('%s:%s', i, interact)))
sqrt(sum(X[sub_idx, sub_idx]))
})
}
# Fit linear models for the short-term (peak) and long-term (permanent)
# boost relative ranks, with age x death-type interactions, and plot the
# resulting age-coefficient profiles for natural vs. unnatural deaths.
#
# Args:
#   medium: 'NEWS' or 'TWITTER'.
#   lmdata: NOTE(review) -- this argument is immediately overwritten from the
#     globals lmdata_N/lmdata_T below, so whatever the caller passes is
#     ignored; confirm whether the parameter should be honored or dropped.
# Returns: list(mod_peak=..., mod_perm=...) with the two fitted models.
run_regression_analysis_for_death_types <- function(medium, lmdata) {
if (medium != 'NEWS' && medium != 'TWITTER') stop('Medium must be \'NEWS\' or \'TWITTER\'')
lmdata <- if (medium == 'NEWS') lmdata_N else lmdata_T
base_age <- levels(lmdata$age_group)[1]
base_death_type <- levels(lmdata$death_type)[1]
# Compare natural vs. unnatural death for the different age brackets (0 to omit the
# intercept and include it instead as the coefficient of the base age).
predictors <- '0 + mean_before_relrank + age_group + death_type + gender + anglo + type_group + age_group:death_type'
# NOTE(review): `interact` is computed but not used in this function.
interact <- sprintf('death_type%s', if (base_death_type=='natural') 'unnatural' else 'natural')
mod_peak <- lm(as.formula(sprintf('peak_mean_boost_relrank ~ %s', predictors)), lmdata)
mod_perm <- lm(as.formula(sprintf( 'perm_boost_relrank ~ %s', predictors)), lmdata)
# These plots show the mean relative rank (w.r.t. output measure) as a function of age,
# for a male anglo artist of median pre-mortem popularity.
ylim <- c(-0.3,0.6)
plot_age_coeffs_for_death_types(mod_peak, medium, 'peak_mean_boost', base_age, ylim)
plot_age_coeffs_for_death_types(mod_perm, medium, 'perm_boost', base_age, ylim)
return(list(mod_peak=mod_peak, mod_perm=mod_perm))
}
# Plot the age-coefficient profiles (effect of age bracket on the outcome's
# relative rank), separately for natural and unnatural deaths, with error
# bars of +/- 2 compound standard errors.
#
# Args:
#   mod: fitted lm from run_regression_analysis_for_death_types().
#   medium: 'NEWS' or 'TWITTER' (color and output file name).
#   outcome: 'peak_mean_boost', 'perm_boost', 'peak_mean_boost_diff',
#     or 'perm_boost_diff'.
#   base_age: reference age level (its interaction coefficient is an
#     implicit zero in the model).
#   ylim: optional y-range; derived from the error bars when NULL.
#   col: line color for the natural-death curve.
plot_age_coeffs_for_death_types <- function(mod, medium, outcome, base_age, ylim=NULL, col=COL[[medium]]) {
summary <- coef(summary(mod))
# Non-interaction terms (i.e., natural death).
idx <- grep('^age_group..$', rownames(summary))
# Extract the two-digit decade from each coefficient name, and sort by it.
decades <- as.numeric(sub('.*(\\d\\d).*', '\\1', rownames(summary))[idx])
ord <- order(decades)
decades <- decades[ord]
age_coef_natural <- summary[idx,1][ord]
se_natural <- compound_se_for_lm(mod)[ord]
# Interaction terms (i.e., unnatural death).
# Add the death-type main effect to every age:death-type interaction.
idx <- grep('^age_group..:death_typeunnatural$', rownames(summary))
summary[idx,1] <- summary[idx,1] + summary['death_typeunnatural',1]
# Rename the main effect so it acts as the base age's interaction term.
rownames(summary)[rownames(summary)=='death_typeunnatural'] <- sprintf('age_group%s:death_typeunnatural', base_age)
# Add age_group baseline coefs to interaction coefs.
idx <- grep('^age_group..:death_typeunnatural$', rownames(summary))
summary[idx,1] <- summary[idx,1] + summary[grep('age_group..$', rownames(summary)), 1]
age_coef_unnatural <- summary[idx,1][ord]
se_unnatural <- compound_se_for_lm(mod, 'death_typeunnatural', base_age)[ord]
if (is.null(ylim)) ylim <- range(c(age_coef_natural - 2*se_natural,
age_coef_natural + 2*se_natural,
age_coef_unnatural - 2*se_unnatural,
age_coef_unnatural + 2*se_unnatural), na.rm=TRUE)
col_unnat <- 'gray'
if (outcome == 'peak_mean_boost') {
# NOTE(review): the sprintf format has no placeholder, so `medium` is unused
# here (harmless in R, but likely a leftover).
main <- sprintf('Short-term boost', medium)
outfile <- sprintf('%s/age_coefs_%s_%s.pdf', PLOTDIR, medium, outcome)
} else if (outcome == 'perm_boost') {
main <- sprintf('Long-term boost', medium)
outfile <- sprintf('%s/age_coefs_%s_%s.pdf', PLOTDIR, medium, outcome)
} else if (outcome == 'peak_mean_boost_diff') {
main <- 'Short-term boost'
# NOTE(review): 'NEWS-VS_TWITTER' mixes '-' and '_' -- confirm the intended
# file name before relying on it downstream.
outfile <- sprintf('%s/age_coefs_NEWS-VS_TWITTER_%s.pdf', PLOTDIR, outcome)
} else if (outcome == 'perm_boost_diff') {
main <- 'Long-term boost'
outfile <- sprintf('%s/age_coefs_NEWS-VS_TWITTER_%s.pdf', PLOTDIR, outcome)
} else {
stop('Illegal outcome name')
}
if (SAVE_PLOTS) cairo_pdf(outfile, width=3.4, height=2, pointsize=8, family='Helvetica')
par(mar=c(3,3,1,1))
# Unnatural-death curve; x offset +6 within each decade so the two curves
# do not overlap.
plot(decades+6, age_coef_unnatural, type='b', panel.first=abline(h=0, col='gray'),
ylim=ylim, xlim=c(min(decades), max(decades)+10), bty='n', xaxt='n', yaxt='n',
main=main, xlab='', ylab='', lwd=2, col=col_unnat)
dispersion(decades+6, age_coef_unnatural, 2*se_unnatural, col=col_unnat, arrow.cap=0.005)
axis(1, at=c(decades, max(decades)+10), line=0)
axis(2, line=0)
mtext('Age at death', 1, line=2)
mtext('Effect size', 2, line=2)
# Natural-death curve; x offset +4.
points(decades+4, age_coef_natural, type='b', lwd=2, col=col)
dispersion(decades+4, age_coef_natural, 2*se_natural, col=col, arrow.cap=0.005)
legend('bottomleft', c('Natural death', 'Unnatural death'), col=c(col, col_unnat), lty=1, lwd=2, horiz=TRUE, bty='n')
if (SAVE_PLOTS) dev.off()
}
# Draw the composite "mega figure": an annotated example time series (bottom
# left), per-cluster average curves and centroid bar plots for both media,
# and the news-vs-Twitter cluster confusion matrix (right).
# Relies on globals: km_N, km_T, X_N, X_T, x_N, x_T, stats_N, wiki_to_mid,
# num_art_N, num_art_T, SAVE_PLOTS, PLOTDIR, COL, COL_*, .default_par.
# Side effects: draws the figure; writes megafigure.pdf when SAVE_PLOTS.
draw_mega_figure <- function() {
days <- -100:360
feat_cols <- c(mean_before=COL_DARKBLUE, peak_mean_boost=COL_MAGENTA, perm_boost=COL_GREEN, time_till_half=COL_RED)
# Paint the current plot region with a solid background color.
bg <- function(col) rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col=col, border=NA)
mar <- 0.2
if (SAVE_PLOTS) cairo_pdf(sprintf('%s/megafigure.pdf', PLOTDIR), width=8, height=8, pointsize=6, family='Helvetica')
par(.default_par)
plot.new()
# Height of one margin line as a fraction of the device (width, height).
rel_line_height <- par('mai')[1]/par('mar')[1]/par('din')
### Confusion matrix of news vs. Twitter cluster assignments.
par(fig=c(0.59, 0.91, 0.09, 0.41), new=TRUE)
par(mar=c(0,0,0,0))
confusion <- table(as.factor(km_N$cluster), as.factor(km_T$cluster))
# Square-root scaling so that small counts remain visible in the color map.
palmat <- color.scale(confusion^0.5, cs1=c(1,1), cs2=c(1,0.2), cs3=c(1,0.2))
# Must fix the function definition of color2D.matplot: do fix(color2D.matplot) and replace "f" with "d" on line 101:
# https://stackoverflow.com/questions/44522895/plot-color-matrix-with-numbers-in-r
color2D.matplot(confusion, cellcolors=palmat, main='', show.values=1, vcol=rgb(0,0,0), axes=FALSE, vcex=1.2)
feats <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
# bar_range <- 1.1 * range(c(km_N$centers, km_N$centers))
bar_range <- c(-2.1, 5.2)
### Per-cluster average curves along the matrix margins, plus centroid bars.
for (medium in c('NEWS', 'TWITTER')) {
if (medium == 'NEWS') {
kmeans <- km_N
X <- X_N
x <- x_N
# News clusters run down the left side of the matrix; bars on the right.
fig <- function(j) c(0.51, 0.59, (4-j)*0.08+0.09, (5-j)*0.08+0.09)
fig_incr <- c(mar*rel_line_height[1], mar*rel_line_height[1], 0, 0)
fig_bar <- function(j) c(0.91, 0.99, (4-j)*0.08+0.09, (5-j)*0.08+0.09)
} else {
kmeans <- km_T
X <- X_T
x <- x_T
# Twitter clusters run along the top of the matrix; bars at the bottom.
fig <- function(j) c(j*0.08+0.51, (j+1)*0.08+0.51, 0.41, 0.49)
fig_incr <- c(0, 0, -mar*rel_line_height[2], -mar*rel_line_height[2])
fig_bar <- function(j) c(j*0.08+0.51, (j+1)*0.08+0.51, 0.01, 0.09)
}
cluster_sizes <- kmeans$size
# Column j = average time series of cluster j's members.
y <- do.call(cbind, lapply(1:4, function(j) colMeans(X[kmeans$cluster==j, colnames(X) %in% days], na.rm=TRUE)))
for (j in 1:4) {
# bar_range <- 1.1 * range(kmeans$centers)
par(fig=fig(j)-fig_incr, new=TRUE, mar=c(mar, mar, mar, mar))
plot(days, y[,j], type='l', ylim=range(y), las=2, col=COL[[medium]], lwd=2, xlab='', ylab='', xaxt='n', yaxt='n',
bty='n', panel.first=bg("#eeeeee"))
text(max(days), max(y), sprintf('C%d', j), adj=c(1,1), cex=1)
par(fig=fig_bar(j)+fig_incr, new=TRUE, mar=c(mar, mar, mar, mar))
barplot(kmeans$centers[j,], main='', ylab='', col=feat_cols, names.arg='', ylim=bar_range, yaxt='n', border=NA,
bty='n', panel.first=bg("#eeeeee"))
# Only the last Twitter bar plot gets the shared z-score axis.
if (medium=='TWITTER' && j==4) {
mtext('z-scores', side=4, line=1.8, cex=0.5)
axis(4, at=seq(-2,5), labels=c('-2','','0','','2','','4',''), cex.axis=0.5, cex.lab=0.5,
lwd=0.8, lwd.ticks=0.8, las=1, hadj=0)
}
text(par("usr")[2]*0.95, par("usr")[4]*0.95, adj=c(1,1), cex=0.5,
sprintf('N=%d\n(%.0f%%)', kmeans$size[j], 100*kmeans$size[j]/sum(kmeans$size)))
}
}
### Matrix row label.
par(fig=c(0.50, 1, 0.50, 0.55), new=TRUE, mar=c(0,0,0,0))
plot(NULL, xaxt='n', yaxt='n', bty='n', ylab='', xlab='', xlim=0:1, ylim=0:1)
text(0.5, 0, 'Twitter', adj=c(0.5,0), col=COL[['TWITTER']], cex=1.2)
### Matrix column label.
par(fig=c(0.40, 0.59, 0.42, 0.50), new=TRUE, mar=c(0,0,0,0))
plot(NULL, xaxt='n', yaxt='n', bty='n', ylab='', xlab='', xlim=0:1, ylim=0:1)
text(1, 0, 'News', adj=c(1,0), col=COL[['NEWS']], cex=1.2)
### Barplot legend.
par(fig=c(0.45, 0.59, 0.01, 0.09), new=TRUE)
plot(NULL, xaxt='n', yaxt='n', bty='n', ylab='', xlab='', xlim=0:1, ylim=0:1)
legend("bottomleft", legend=c(' pre-mortem mean', ' short-term boost', ' long-term boost', ' halving time'),
pch=15, pt.cex=3, cex=0.7, y.intersp=1.7, bty='n', col=feat_cols, text.col=feat_cols)
### White "stacked cards" backdrop behind the example time series.
for (i in 3:1) {
delta <- 0.007
par(fig=c(i*delta, 0.4+i*delta, i*delta, 0.4+i*delta), new=TRUE, mar=c(5,5,0,0))
plot(NULL, xaxt='n', yaxt='n', xlab='', ylab='', panel.first=bg('white'), xlim=0:1, ylim=0:1)
}
par(fig=c(0, 0.50, 0.43, 0.59), new=TRUE, mar=c(0,0,0,0))
plot(NULL, xaxt='n', yaxt='n', bty='n', ylab='', xlab='', xlim=0:1, ylim=0:1)
# NOTE(review): X here is whatever the media loop above left behind (X_T);
# confirm that is the intended N for this caption.
text(0.5, 0, sprintf('Mention time series (N=%d)', dim(X)[1]), adj=c(0.5,0), cex=0.8)
### Annotated example time series (bottom left).
par(fig=c(0, 0.40, 0, 0.40), new=TRUE, mar=c(5,5,0,0))
name <- 'Whitney_Houston'
# name <- 'Donna_Summer'
# name <- 'Amy_Winehouse'
medium <- 'NEWS'
if (medium == 'NEWS') {
X <- X_N
x <- x_N
num_art <- num_art_N
} else {
X <- X_T
x <- x_T
num_art <- num_art_T
}
cex <- 0.65 # 0.85
mid <- wiki_to_mid[sprintf('<http://en.wikipedia.org/wiki/%s>', name)]
raw <- log10(x[mid, colnames(x) %in% days])
smoothed <- X[mid, colnames(X) %in% days]
ymin <- min(raw[is.finite(raw)], na.rm=TRUE)
# Semi-transparent version of the medium color for the raw daily points.
col <- as.vector(col2rgb(COL[[medium]])/255)
col <- rgb(col[1], col[2], col[3], 0.2)
plot(days, raw, col=col, main='', pch=20, cex=2, cex.axis=0.7, cex.lab=0.7,
bty='o', xlab='Days since death', ylab='Fraction mentioning documents [log10]', ylim=c(ymin,-1),
panel.first=c(bg('white'), abline(v=0, col='gray', lty=2)))
lines(names(smoothed), smoothed, col=COL[[medium]], lwd=2)
text(max(days), -1, sprintf('%s', gsub('_', ' ', name)), adj=c(1,1), cex=1) # cex=1.5)
# Reference levels for the four curve-characteristic annotations.
y_before <- stats_N[mid, 'mean_before']
y_peak <- y_before + stats_N[mid, 'peak_mean_boost']
y_perm <- y_before + stats_N[mid, 'perm_boost']
x_tth <- stats_N[mid, 'time_till_half'] * max(as.numeric(colnames(X)))
y_tth <- mean(c(y_peak, y_perm))
# mean before
segments(x0=min(days), y0=y_before, x1=-30, lwd=2, col=feat_cols['mean_before'])
text(-60, y_before, 'pre-mortem\nmean', srt=0, adj=c(0.5,1.5), col=feat_cols['mean_before'], cex=cex)
# peak
segments(x0=0, y0=y_peak, x1=30)
arrows(x0=-40, y0=y_before, y1=y_peak, length=0.05, code=3, lwd=2, col=feat_cols['peak_mean_boost'])
text(-70, mean(c(y_before, y_peak)), 'short-\nterm\nboost', srt=0, adj=c(0.5,0.5), col=feat_cols['peak_mean_boost'], cex=cex)
# perm
segments(x0=30, y0=y_perm, x1=max(days))
arrows(x0=100, y0=y_before, y1=y_perm, length=0.05, code=3, lwd=2, col=feat_cols['perm_boost'])
text(110, mean(c(y_before, y_perm)), 'long-term boost', srt=0, adj=c(0,0.5), col=feat_cols['perm_boost'], cex=cex)
# tth
arrows(x0=1, y0=y_tth, x1=x_tth, length=0.05, code=3, lwd=2, col=feat_cols['time_till_half'])
text(x_tth+10, y_tth, 'halving time', srt=0, adj=c(0,0.5), col=feat_cols['time_till_half'], cex=cex)
### Arrow from example to cluster.
par(fig=c(0.40, 0.51, 0, 0.45), new=TRUE, mar=c(0,0,0,0))
plot(NULL, xaxt='n', yaxt='n', bty='n', xlab='', ylab='', xlim=0:1, ylim=0:1)
col <- 'gray50'
arrows(x0=0, y0=0.5, x1=0.5, length=0.05, code=0, lwd=5, col=col, ljoin=1, lend=2)
arrows(x0=0.5, y0=0.5, y1=0.40, length=0.05, code=0, lwd=5, col=col, ljoin=1, lend=2)
arrows(x0=0.5, y0=0.40, x1=0.98, length=0.2, code=2, lwd=5, col=col, ljoin=1, lend=1)
text(0.50, 0.53, 'Clustering', srt=0, adj=c(0,0.5), col=col, cex=1.5, font=2, srt=90)
if (SAVE_PLOTS) dev.off()
}
# ---- Top-level pipeline: load data, compute statistics, cluster ----
# Per-day total document counts for each medium.
num_art_N <- get_num_art('NEWS')
num_art_T <- get_num_art('TWITTER')
# Raw mention-frequency tables.
data_N <- get_mention_freq_table('NEWS')
data_T <- get_mention_freq_table('TWITTER')
# Person x (day relative to death) matrices of mention fractions.
x_N <- get_rel_date_matrix('NEWS', data_N, num_art_N, CHUNK_SIZE)
x_T <- get_rel_date_matrix('TWITTER', data_T, num_art_T, CHUNK_SIZE)
# Keep only people who pass the per-medium filters in BOTH media.
mids_N <- filter_people('NEWS', x_N)
mids_T <- filter_people('TWITTER', x_T)
mids <- intersect(mids_N, mids_T)
x_N <- x_N[mids,]
x_T <- x_T[mids,]
# Smoothed versions of the per-person time series.
X_N <- normalize_and_smooth('NEWS', x_N, num_art_N, mean_center=FALSE)
X_T <- normalize_and_smooth('TWITTER', x_T, num_art_T, mean_center=FALSE)
# Per-person biographic and curve statistics.
stats_all <- load_stats_all()
stats_N <- load_stats('NEWS', x_N, X_N, num_art_N)
stats_T <- load_stats('TWITTER', x_T, X_T, num_art_T)
# Regression-ready subsets (complete covariates, releveled factors).
lmdata_N <- make_regression_data('NEWS')
lmdata_T <- make_regression_data('TWITTER')
# stats_N and stats_T contain the same people, so we arbitrarily use stats_N here.
natural <- which(stats_N$natural_death == 1)
unnatural <- which(stats_N$unnatural_death == 1)
# Are there any people with ambiguous causes of death? -- No!
length(intersect(natural, unnatural))
# Cluster people on the four curve characteristics, for k = 2..30.
cluster_feats <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
cluster_result_N <- cluster(stats_N, cluster_feats)
cluster_result_T <- cluster(stats_T, cluster_feats)
# k = 4 is the clustering analyzed in the figures below.
k <- 4
km_N <- cluster_result_N$clustering[[which(cluster_result_N$K==k)]]
km_T <- cluster_result_T$clustering[[which(cluster_result_T$K==k)]]
plot_num_art(num_art_N, 'NEWS')
plot_num_art(num_art_T, 'TWITTER')
Figure 1: Number of documents per day in the Spinn3r corpus.
Acute myeloid leukemia, Adrenocortical carcinoma, Alveolar rhabdomyosarcoma, Alzheimer’s disease, Amyloidosis, Amyotrophic lateral sclerosis, Anemia, Aneurysm, Aortic aneurysm, Aortic dissection, Appendix cancer, Asthma, Astrocytoma, Atherosclerosis, Atypical teratoid rhabdoid tumor, B-cell chronic lymphocytic leukemia, Bladder cancer, Bleeding, Blood disorder, Blunt trauma, Bone cancer, Bone tumor, Brain Cancer, Brain damage, Brain tumor, Breast cancer, Bronchitis, Bronchopneumonia, Cancer, Cardiac arrest, Cardiac dysrhythmia, Cardiac surgery, Cardiopulmonary Arrest, Cardiovascular disease, Cerebral hemorrhage, Cerebral infarction, Cervical cancer, Cholangiocarcinoma, Chronic kidney disease, Chronic Obstructive Pulmonary Disease, Cirrhosis, Colorectal cancer, Complication, Complications from a stroke, Complications from cardiac surgery, Complications from pneumonia, Complications of diabetes mellitus, Congenital heart defect, Coronary artery disease, Craniocerebral Trauma, Creutzfeldt–Jakob disease, Cystic fibrosis, Dementia, Dementia with Lewy bodies, Diabetes mellitus, Disease, Ebola virus disease, Emphysema, Epileptic seizure, Esophageal cancer, Gallbladder cancer, Glioblastoma multiforme, Heart Ailment, Heart failure, Heart valve disease, Heat Stroke, Hepatitis, HIV/AIDS, Hodgkin’s lymphoma, Huntington’s disease, Hypertension, Hypertensive heart disease, Hyperthermia, Illness, Infection, Influenza, Internal bleeding, Intracranial aneurysm, Intracranial hemorrhage, Kidney cancer, Laryngeal cancer, Leiomyosarcoma, Leukemia, Liver cancer, Liver disease, Liver failure, Liver tumour, Lung cancer, Lung disease, Lung Infection, Lymphoma, Malaria, Melanoma, Meningitis, Mesothelioma, Metastatic breast cancer, Metastatic Melanoma, Motor neuron disease, Multiple myeloma, Multiple organ dysfunction syndrome, Multiple organ failure, Multiple sclerosis, Multiple system atrophy, Myelodysplastic syndrome, Myocardial infarction, Natural causes, Nephropathy, Non-Hodgkin 
lymphoma, Old age, Oral cancer, Organ dysfunction, Ovarian cancer, Pancreatic cancer, Pancreatitis, Parkinson’s disease, Peritonitis, Pneumonia, Pneumothorax, Polycythemia, Polymyalgia rheumatica, Progressive supranuclear palsy, prolonged illness, Prostate cancer, Pulmonary edema, Pulmonary embolism, Pulmonary failure, Pulmonary fibrosis, Pyelonephritis, Renal failure, Respiratory arrest, Respiratory disease, Respiratory failure, Salivary gland neoplasm, Sepsis, Septic shock, Skin cancer, Smallpox, Squamous-cell carcinoma, Stomach cancer, Stroke, Subarachnoid hemorrhage, Subdural hematoma, Surgery, Surgical complications, T-Cell Lymphoma, Terminal illness, Throat Cancer, Thrombosis, Thrombus, Thyroid cancer, Urinary tract infection, Uterine cancer, Vascular dementia, Viral pneumonia
Accident, Accidental drug overdose, Accidental fall, Airstrike, Alcohol intoxication, Ambush, Asphyxia, Assassination, Assisted suicide, Aviation accident or incident, Ballistic trauma, Bike accident, Blast injury, Boating Accident, Bomb, Brain injury, Capital punishment, Car bomb, Carbon monoxide poisoning, Casualty of war, Cocaine overdose, Combined drug intoxication, Decapitation, Drowning, Drug overdose, Execution, Execution by firing squad, Execution-style murder, Explosion, Falling, Falling from height, Fire, Firearm, Gunshot, Hanging, Helicopter crash, Heroin overdose, Hit and run, Homicide, Improvised bombing, Injury, Killed in action, Lethal injection, Lightning, Major trauma, Motorcycle accident, Mountaineering, Murder, Murder_suicide, Murder–suicide, Poison, Poisoning, Racing Accident, Self-inflicted wound, Shark attack, Shootout, Skiing accident, Smoke inhalation, Stab wound, Stabbing, Strangling, Struck by car, Suicide, Suicide attack, Suicide by hanging, Tornado Incident, Torture, Traffic collision, Traumatic brain injury
The first column represents the set of all 33340 Freebase people who died during the study period (11 June 2009 to 30 September 2014). The second column represents the 2362 people included in our study. The third column represents the subset of 870 people included in the regression analysis.
(A note on age: among all people in Freebase, there are some obvious age errors [e.g., 610 years]. Even when ignoring all ages above 120, the mean changes only slightly, dropping from 76.26 to 76.22.)
table <- make_bio_table()
table
## All Included Regression
## Age
## Age N/A 10% 6% 0%
## 1st quartile 68 64 60
## Mean 76 74 70
## Median 80 77 73
## 3rd quartile 88 87 84
## Gender
## Gender N/A 27% 7% 0%
## Female 16% 17% 17%
## Male 84% 83% 83%
## Manner of death
## Manner of death N/A 76% 60% 0%
## Natural 85% 86% 88%
## Unnatural 15% 14% 12%
## Cultural background
## Cultural background N/A 45% 27% 14%
## Anglophone 60% 82% 80%
## Non-anglophone 40% 18% 20%
## Notability type
## Notability type N/A 1% 0% 0%
## art 40% 50% 56%
## sports 14% 14% 14%
## leadership 11% 14% 13%
## known for death 26% 16% 10%
## general fame 7% 4% 5%
## academia/engineering 2% 2% 2%
## Count 33340 2362 870
Table 1: Biographic statistics of public figures.
(a) News
plot_time_series_for_small_multiples('Benoit_Mandelbrot', 'NEWS', X_N, x_N, num_art_N, ylab=TRUE)
plot_time_series_for_small_multiples('Kashiram_Rana', 'NEWS', X_N, x_N, num_art_N, ylab=FALSE)
plot_time_series_for_small_multiples('Amy_Winehouse', 'NEWS', X_N, x_N, num_art_N, ylab=FALSE)
plot_time_series_for_small_multiples('George_McGovern', 'NEWS', X_N, x_N, num_art_N, ylab=FALSE)
(b) Twitter
plot_time_series_for_small_multiples('Benoit_Mandelbrot', 'TWITTER', X_T, x_T, num_art_T, ylab=TRUE)
plot_time_series_for_small_multiples('Kashiram_Rana', 'TWITTER', X_T, x_T, num_art_T, ylab=FALSE)
plot_time_series_for_small_multiples('Amy_Winehouse', 'TWITTER', X_T, x_T, num_art_T, ylab=FALSE)
plot_time_series_for_small_multiples('George_McGovern', 'TWITTER', X_T, x_T, num_art_T, ylab=FALSE)
Figure 2: Examples of mention time series for four deceased public figures.
overlay_time_series(X_N, 'NEWS')
overlay_time_series(X_T, 'TWITTER')
Figure 3: Overlay of all mention time series.
mmh_N <- plot_max_mem_hist('NEWS')
mmh_T <- plot_max_mem_hist('TWITTER')
Figure 4: Histogram of post-mortem peak days. Left: news. Right: Twitter. For each post-mortem day we count for how many individuals the mention frequency is largest on that day, out of the 100 days immediately following death. For most people, the maximum mention frequency is observed on the day after death.
The first entry of each vector corresponds to day 0 (i.e., the day of death).
h_N <- mmh_N$hist
h_T <- mmh_T$hist
max_days_N <- mmh_N$max_days
max_days_T <- mmh_T$max_days
probs_N <- (h_N$counts/sum(h_N$counts))[1:30]
probs_T <- (h_T$counts/sum(h_T$counts))[1:30]
News:
probs_N
## [1] 0.2167654530 0.2933954276 0.1109229467 0.0444538527 0.0309060119
## [6] 0.0258255715 0.0207451312 0.0114309907 0.0135478408 0.0101608806
## [11] 0.0076206605 0.0063505504 0.0050804403 0.0042337003 0.0038103302
## [16] 0.0033869602 0.0063505504 0.0008467401 0.0029635902 0.0029635902
## [21] 0.0046570703 0.0033869602 0.0021168501 0.0033869602 0.0042337003
## [26] 0.0021168501 0.0033869602 0.0012701101 0.0029635902 0.0025402202
Twitter:
probs_T
## [1] 0.2878916173 0.3302286198 0.0999153260 0.0482641829 0.0325994920
## [6] 0.0169348010 0.0156646909 0.0084674005 0.0042337003 0.0080440305
## [11] 0.0076206605 0.0055038103 0.0029635902 0.0029635902 0.0059271804
## [16] 0.0025402202 0.0055038103 0.0025402202 0.0012701101 0.0033869602
## [21] 0.0025402202 0.0004233700 0.0016934801 0.0008467401 0.0012701101
## [26] 0.0025402202 0.0055038103 0.0021168501 0.0016934801 0.0021168501
The first entry of each vector corresponds to day 0 (i.e., the day of death).
News:
cumsum(probs_N)
## [1] 0.2167655 0.5101609 0.6210838 0.6655377 0.6964437 0.7222693 0.7430144
## [8] 0.7544454 0.7679932 0.7781541 0.7857748 0.7921253 0.7972058 0.8014395
## [15] 0.8052498 0.8086367 0.8149873 0.8158340 0.8187976 0.8217612 0.8264183
## [22] 0.8298052 0.8319221 0.8353091 0.8395428 0.8416596 0.8450466 0.8463167
## [29] 0.8492803 0.8518205
Twitter:
cumsum(probs_T)
## [1] 0.2878916 0.6181202 0.7180356 0.7662997 0.7988992 0.8158340 0.8314987
## [8] 0.8399661 0.8441998 0.8522439 0.8598645 0.8653683 0.8683319 0.8712955
## [15] 0.8772227 0.8797629 0.8852667 0.8878069 0.8890771 0.8924640 0.8950042
## [22] 0.8954276 0.8971211 0.8979678 0.8992379 0.9017782 0.9072820 0.9093988
## [29] 0.9110923 0.9132091
Those whose peak day comes after day 29 tend to have low short-term boosts, as seen by computing, for the people with a late peak, their relative rank with respect to the short-term boost.
News:
max_rank <- length(max_days_N)-1
(summary(stats_N$peak_mean_boost_rank[max_days_N>29])-1) / max_rank
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2238 0.4439 0.4620 0.6837 0.9958
# Compare this to the fraction of people with a late peak day.
(sum(max_days_N>29)-1) / max_rank
## [1] 0.1478187
Twitter:
(summary(stats_T$peak_mean_boost_rank[max_days_T>29])-1) / max_rank
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002541 0.181279 0.430750 0.454508 0.713681 0.998306
# Compare this to the fraction of people with a late peak day.
(sum(max_days_T>29)-1) / max_rank
## [1] 0.08640407
coefs_N <- biexp_fit('NEWS')
coefs_T <- biexp_fit('TWITTER')
(a) News
plot_avg_fraction_of_mentioning_docs('NEWS', biexp_coefs=coefs_N)
(b) Twitter
plot_avg_fraction_of_mentioning_docs('TWITTER', biexp_coefs=coefs_T)
Figure 5: Average mention time series, obtained via the arithmetic mean of the individual raw mention time series of the people included in the study, for (a) news and (b) Twitter.
News:
coefs_N
## p N r q
## 2.326321e-01 2.738333e-05 1.882700e-02 5.124454e-04
Twitter:
coefs_T
## p N r q
## 2.525830e-01 7.082047e-07 1.018764e-02 4.961737e-04
plot_comm_vs_cult_mem('NEWS', biexp_coefs=coefs_N)
plot_comm_vs_cult_mem('TWITTER', biexp_coefs=coefs_T)
Figure 6: Proportion of cultural vs. communicative memory. On day \(t\) after death, the total collective memory according to the biexponential model fit is \(S(t)=u(t)+v(t)\), where \(u(t)\) is the communicative memory, and \(v(t)\) is the cultural memory. We plot the proportion \(v(t)/S(t)\) as a function of \(t\). Left: news. Right: Twitter.
(a) News
# Summary statistics of the four mention-curve characteristics (news).
summ <- summary(stats_N[, c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')])
# The columns were selected above in exactly the order of
# FANCY_CURVE_CHAR_NAMES, so assign the pretty names positionally.
# (The original indexed the *unnamed* FANCY_CURVE_CHAR_NAMES vector with
# character subscripts, trimws(colnames(summ)), which yields NA for every
# column -- character indexing only works on a *named* vector.)
colnames(summ) <- FANCY_CURVE_CHAR_NAMES
summ
## Pre-mortem mean Short-term boost Long-term boost Halving time
## Min. :-5.851 Min. :-0.2055 Min. :-1.5490200 Min. :0.01667
## 1st Qu.:-5.835 1st Qu.: 1.1924 1st Qu.:-0.0147554 1st Qu.:0.05903
## Median :-5.819 Median : 1.9754 Median : 0.0005455 Median :0.16111
## Mean :-5.755 Mean : 1.9171 Mean : 0.0191381 Mean :0.23172
## 3rd Qu.:-5.770 3rd Qu.: 2.6687 3rd Qu.: 0.0270671 3rd Qu.:0.35833
## Max. :-3.267 Max. : 4.2103 Max. : 1.3884812 Max. :0.95833
(b) Twitter
# Summary statistics of the four mention-curve characteristics (Twitter).
summ <- summary(stats_T[, c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')])
# Columns are in the same order as FANCY_CURVE_CHAR_NAMES, so assign the
# pretty names positionally. (Character indexing into the unnamed
# FANCY_CURVE_CHAR_NAMES vector, as originally written, returns NAs.)
colnames(summ) <- FANCY_CURVE_CHAR_NAMES
summ
## Pre-mortem mean Short-term boost Long-term boost Halving time
## Min. :-7.866 Min. :-0.0413 Min. :-0.557536 Min. :0.01667
## 1st Qu.:-7.839 1st Qu.: 1.5856 1st Qu.:-0.006985 1st Qu.:0.06667
## Median :-7.803 Median : 2.4474 Median : 0.016047 Median :0.15972
## Mean :-7.664 Mean : 2.3262 Mean : 0.055143 Mean :0.22363
## 3rd Qu.:-7.671 3rd Qu.: 3.0998 3rd Qu.: 0.077694 3rd Qu.:0.33611
## Max. :-4.576 Max. : 4.9904 Max. : 1.552516 Max. :0.88056
Table 2: Summaries of distribution of mention time series characteristics for (a) the news and (b) Twitter.
\[\\[4mm]\]
# Kernel-smoothed density plots of the curve characteristics (Figure 7);
# plotting helper defined elsewhere in this file.
plot_smoothed_densities_raw(stats_N, 'NEWS', 'mid', list(unique(stats_N$mid)), main='NEWS')
plot_smoothed_densities_raw(stats_T, 'TWITTER', 'mid', list(unique(stats_T$mid)), main='TWITTER')
Figure 7: Kernel-smoothed density plots of mention time series characteristics for the news (top) and Twitter (bottom).
News:
boot_ci(stats_N$peak_mean_boost, median)
## upper point_est lower
## 2.032618 1.975446 1.898431
wilcox.test(stats_N$peak_mean_boost, mu=0, conf.int=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: stats_N$peak_mean_boost
## V = 2790570, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 1.886672 1.967571
## sample estimates:
## (pseudo)median
## 1.927002
Twitter:
boot_ci(stats_T$peak_mean_boost, median)
## upper point_est lower
## 2.501174 2.447381 2.370275
wilcox.test(stats_T$peak_mean_boost, mu=0, conf.int=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: stats_T$peak_mean_boost
## V = 2790610, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 2.309257 2.402026
## sample estimates:
## (pseudo)median
## 2.355734
wilcox.test(x=stats_N$peak_mean_boost, y=stats_T$peak_mean_boost, paired=TRUE, conf.int=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: stats_N$peak_mean_boost and stats_T$peak_mean_boost
## V = 477893, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -0.4415986 -0.3917311
## sample estimates:
## (pseudo)median
## -0.416588
News:
boot_ci(stats_N$perm_boost, median)
## upper point_est lower
## 0.0017170682 0.0005454994 -0.0008536635
wilcox.test(stats_N$perm_boost, mu=0, conf.int=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: stats_N$perm_boost
## V = 1560596, p-value = 6.199e-07
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 0.002285753 0.005541149
## sample estimates:
## (pseudo)median
## 0.003893564
Twitter:
boot_ci(stats_T$perm_boost, median)
## upper point_est lower
## 0.01748789 0.01604730 0.01329259
wilcox.test(stats_T$perm_boost, mu=0, conf.int=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: stats_T$perm_boost
## V = 2053579, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 0.02424449 0.03112195
## sample estimates:
## (pseudo)median
## 0.02754178
wilcox.test(x=stats_N$perm_boost, y=stats_T$perm_boost, paired=TRUE, conf.int=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: stats_N$perm_boost and stats_T$perm_boost
## V = 881590, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -0.02542398 -0.01933168
## sample estimates:
## (pseudo)median
## -0.02231052
(a) News
plot_chars_by_group(stats_N, 'NEWS', stats_N$type_group, type_groups)
(b) Twitter
plot_chars_by_group(stats_T, 'TWITTER', stats_T$type_group, type_groups)
Figure 8: Distributions of mention time series characteristics by notability type, visualized as box plots, for (a) the news and (b) Twitter.
# Average silhouette width vs. number of clusters k (Figure 9).
plot_sil_width('NEWS', cluster_result_N$clustering, cluster_result_N$diss)
plot_sil_width('TWITTER', cluster_result_T$clustering, cluster_result_T$diss)
Figure 9: Average silhouette width of clusterings produced by \(k\)-means algorithm, as a function of the number \(k\) of clusters (higher is better), for \(k \in \{2, \dots, 30\}\). Both (a) in the news and (b) on Twitter, \(k=4\) is optimal.
# Overlay the time series belonging to each cluster (Figure 10).
plot_clustered_time_series('NEWS', km_N, X_N)
plot_clustered_time_series('TWITTER', km_T, X_T)
Figure 10: Overlay of time series in each cluster, for news (top) and Twitter (bottom).
# Contingency table of news (rows) vs. Twitter (columns) cluster labels.
X <- table(factor(km_N$cluster), factor(km_T$cluster))
# Chi-squared test of independence with a simulated p-value (1e5 replicates).
chisq.test(X, simulate.p.value=TRUE, B=1e5)
##
## Pearson's Chi-squared test with simulated p-value (based on 1e+05
## replicates)
##
## data: X
## X-squared = 1739.3, df = NA, p-value = 1e-05
Null probabilities represent the case where news and Twitter are assumed to be independent (with the constraint that cluster sizes must remain constant).
The following matrix shows the log10 ratio of the empirical cluster size and the cluster size under the null model (rows are news cluster, columns are Twitter clusters). That is, positive values imply overrepresentation in the empirical data, and negative values, underrepresentation. We see that all diagonal entries are overrepresented, whereas all off-diagonal entries except two—\((3,4)\) and \((4,3)\)—are underrepresented.
# Expected cell counts under independence, given the observed margins.
X.indep <- outer(rowSums(X), colSums(X)) / sum(X)
# log10 observed-to-expected ratio: positive = overrepresented cell.
log10(X / X.indep)
##
## 1 2 3 4
## 1 0.12144877 -0.41341084 -0.03466453 -0.37941148
## 2 -0.31927109 0.43136894 -0.77466424 -0.76366886
## 3 -0.40569573 -1.34225550 0.68375399 0.72277810
## 4 -0.24239967 -0.89911874 0.01774628 1.06900484
To quantify the overrepresentation of the diagonal, we compute four diagonal sums (everything under the constraint that the row and column sums are fixed to the empirical values):
The minimum and maximum are computed via linear programming.
# Bounds on the diagonal sum of a k x k contingency table with fixed margins,
# computed via linear programming (decision variables are the k^2 cells of the
# table in column-major order, matching as.vector/matrix).
k <- nrow(X)
# Objective: sum of the diagonal entries.
objective.in <- as.vector(diag(nrow=k))
# Constraint matrix: the first k rows fix the column sums, the next k rows
# fix the row sums (2*k equality constraints in total).
const.mat <- rbind(t(diag(1,k) %x% rep(1,k)),
t(rep(1,k) %x% diag(1,k)))
const.rhs <- c(colSums(X), rowSums(X))
# One direction per constraint: const.mat has 2*k rows, so 2*k '==' entries
# are needed. (The original rep('==', k) supplied only k directions, leaving
# half the constraints without an explicit direction.)
const.dir <- rep('==', 2*k)
opt.max <- lp('max', objective.in, const.mat, const.dir, const.rhs)
X.max <- matrix(opt.max$solution, k, k)
opt.min <- lp('min', objective.in, const.mat, const.dir, const.rhs)
X.min <- matrix(opt.min$solution, k, k)
The values of the four above-described diagonal sums are as follows:
# Diagonal sums for: LP minimum, independence null, observed data, LP maximum.
sums <- vapply(list(X.min, X.indep, X, X.max), function(m) sum(diag(m)), numeric(1))
sums
## [1] 505.000 1059.347 1727.000 2236.000
Normalizing by the min-to-max span, we see that the empirical diagonal gets 71% of the way between min and max, whereas the null only gets 32%, for a factor of 2.2.
(sums - sum(diag(X.min))) / (sum(diag(X.max)) - sum(diag(X.min)))
## [1] 0.0000000 0.3202468 0.7059503 1.0000000
On the contrary, nearly all off-diagonal entries are underrepresented, with the exception of \((4,3)\), which is not significantly different from the null:
prop.test(X[4,3], sum(X), X.indep[4,3]/sum(X))
##
## 1-sample proportions test with continuity correction
##
## data: X[4, 3] out of sum(X), null probability X.indep[4, 3]/sum(X)
## X-squared = 5.0682e-29, df = 1, p-value = 1
## alternative hypothesis: true p is not equal to 0.002844932
## 95 percent confidence interval:
## 0.001359169 0.006264032
## sample estimates:
## p
## 0.00296359
and \((3,4)\), which is massively overrepresented, compared to the null:
prop.test(X[3,4], sum(X), X.indep[3,4]/sum(X))
##
## 1-sample proportions test with continuity correction
##
## data: X[3, 4] out of sum(X), null probability X.indep[3, 4]/sum(X)
## X-squared = 135.03, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.003206284
## 95 percent confidence interval:
## 0.01228142 0.02321996
## sample estimates:
## p
## 0.0169348
Note: the variable names in the code may differ from those in the paper; see mapping in lists below.
Independent variables:
mean_before, age_group, death_type, type_group, anglo, gender
Dependent variables:
peak_mean_boost, perm_boost
predictors <- 'mean_before_relrank + age_group + death_type + gender + anglo + type_group'
# Fit the four main regression models: short-term ('peak') and long-term
# ('perm') boost, each for news (N) and Twitter (T), on rank-normalized
# outcomes with the shared predictor string defined just above.
mod_N_peak <- lm(as.formula(sprintf('peak_mean_boost_relrank ~ %s', predictors)), lmdata_N)
mod_T_peak <- lm(as.formula(sprintf('peak_mean_boost_relrank ~ %s', predictors)), lmdata_T)
mod_N_perm <- lm(as.formula(sprintf(' perm_boost_relrank ~ %s', predictors)), lmdata_N)
mod_T_perm <- lm(as.formula(sprintf(' perm_boost_relrank ~ %s', predictors)), lmdata_T)
# Human-readable coefficient names; [-6] drops the 70--79 decade, which is
# the omitted reference level in the model summaries below.
age_names <- sprintf("Age: %d--%d", seq(20,90,10)[-6], seq(29,99,10)[-6])
type_names <- levels(lmdata_N$type_group)[-1]
# Render all four models as one LaTeX table and write it to TABLEDIR.
table <- texreg(list(mod_N_peak, mod_T_peak, mod_N_perm, mod_T_perm), dcolumn=TRUE, booktabs=TRUE,
use.packages=FALSE, digits=3, leading.zero=FALSE, single.row=TRUE, stars = c(0.001,0.01,0.05),
custom.model.names=c("\\shortstack{Short-term boost\\\\(News)}",
"\\shortstack{Short-term boost\\\\(Twitter)}",
"\\shortstack{Long-term boost\\\\(News)}",
"\\shortstack{Long-term boost\\\\(Twitter)}"),
custom.coef.names=c("(Intercept)", "Pre-mortem mean (relative rank)", age_names, "Manner of death: unnatural",
"Gender: female", paste("Cultural background:", c("non-anglophone", "unknown")),
paste("Notability type:", type_names)),
reorder.coef=c(1,2,10,12:13,11,14:18,3:9), table=FALSE)
cat(table, file=sprintf('%s/lm_coef_T_and_N.tex', TABLEDIR), sep="\n")
print(summary(mod_N_peak))
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost_relrank ~ %s",
## predictors)), data = lmdata_N)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62044 -0.19422 0.01128 0.20580 0.74237
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.001595 0.020820 0.077 0.9390
## mean_before_relrank 0.292185 0.030757 9.500 < 2e-16 ***
## age_group20 0.054393 0.056290 0.966 0.3342
## age_group30 0.119792 0.055379 2.163 0.0308 *
## age_group40 -0.020965 0.041680 -0.503 0.6151
## age_group50 -0.029145 0.032728 -0.891 0.3734
## age_group60 -0.046225 0.027036 -1.710 0.0877 .
## age_group80 0.011330 0.025892 0.438 0.6618
## age_group90 0.070412 0.032351 2.177 0.0298 *
## death_typeunnatural 0.223393 0.031296 7.138 2.03e-12 ***
## genderFemale 0.032258 0.023861 1.352 0.1768
## anglonon_anglo -0.100770 0.024407 -4.129 4.01e-05 ***
## anglounknown -0.149597 0.028286 -5.289 1.57e-07 ***
## type_groupacademia/engineering 0.054180 0.065320 0.829 0.4071
## type_groupgeneral fame 0.025013 0.040994 0.610 0.5419
## type_groupknown for death -0.041843 0.032861 -1.273 0.2033
## type_groupleadership 0.027634 0.027408 1.008 0.3136
## type_groupsports 0.001862 0.027579 0.068 0.9462
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2555 on 852 degrees of freedom
## Multiple R-squared: 0.2343, Adjusted R-squared: 0.219
## F-statistic: 15.34 on 17 and 852 DF, p-value: < 2.2e-16
Table 3: Linear regression model of short-term boost in the news.
\[\\[4mm]\]
print(summary(mod_N_perm))
##
## Call:
## lm(formula = as.formula(sprintf(" perm_boost_relrank ~ %s",
## predictors)), data = lmdata_N)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.60548 -0.21923 0.00579 0.23231 0.57350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.069386 0.022386 3.099 0.002002 **
## mean_before_relrank -0.035601 0.033071 -1.076 0.282009
## age_group20 0.014011 0.060525 0.231 0.816994
## age_group30 -0.025187 0.059546 -0.423 0.672407
## age_group40 -0.043987 0.044816 -0.982 0.326618
## age_group50 -0.116348 0.035190 -3.306 0.000985 ***
## age_group60 -0.085145 0.029070 -2.929 0.003491 **
## age_group80 -0.009021 0.027840 -0.324 0.745990
## age_group90 0.039381 0.034785 1.132 0.257902
## death_typeunnatural 0.134966 0.033651 4.011 6.58e-05 ***
## genderFemale 0.037946 0.025656 1.479 0.139504
## anglonon_anglo -0.108631 0.026244 -4.139 3.83e-05 ***
## anglounknown -0.145758 0.030415 -4.792 1.94e-06 ***
## type_groupacademia/engineering -0.019779 0.070235 -0.282 0.778306
## type_groupgeneral fame -0.002759 0.044079 -0.063 0.950104
## type_groupknown for death -0.064889 0.035334 -1.836 0.066638 .
## type_groupleadership -0.071294 0.029470 -2.419 0.015763 *
## type_groupsports -0.042940 0.029654 -1.448 0.147977
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2748 on 852 degrees of freedom
## Multiple R-squared: 0.1148, Adjusted R-squared: 0.0971
## F-statistic: 6.498 on 17 and 852 DF, p-value: 1.155e-14
Table 4: Linear regression model of long-term boost in the news.
\[\\[4mm]\]
print(summary(mod_T_peak))
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost_relrank ~ %s",
## predictors)), data = lmdata_T)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71673 -0.18473 -0.00208 0.20710 0.61854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.033516 0.021153 -1.584 0.113459
## mean_before_relrank 0.336775 0.031589 10.661 < 2e-16 ***
## age_group20 0.224402 0.056963 3.939 8.84e-05 ***
## age_group30 0.216619 0.056034 3.866 0.000119 ***
## age_group40 0.099944 0.042134 2.372 0.017910 *
## age_group50 0.048430 0.033115 1.462 0.143973
## age_group60 0.038284 0.027326 1.401 0.161574
## age_group80 0.003695 0.026169 0.141 0.887740
## age_group90 0.006365 0.032745 0.194 0.845930
## death_typeunnatural 0.095068 0.031662 3.003 0.002756 **
## genderFemale -0.007467 0.024089 -0.310 0.756643
## anglonon_anglo -0.030620 0.024636 -1.243 0.214240
## anglounknown -0.107934 0.028809 -3.746 0.000191 ***
## type_groupacademia/engineering 0.120612 0.066075 1.825 0.068295 .
## type_groupgeneral fame 0.052868 0.041492 1.274 0.202944
## type_groupknown for death -0.026970 0.033516 -0.805 0.421209
## type_groupleadership 0.025037 0.027535 0.909 0.363470
## type_groupsports 0.026369 0.028019 0.941 0.346917
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2586 on 852 degrees of freedom
## Multiple R-squared: 0.2161, Adjusted R-squared: 0.2004
## F-statistic: 13.82 on 17 and 852 DF, p-value: < 2.2e-16
Table 5: Linear regression model of short-term boost on Twitter.
\[\\[4mm]\]
print(summary(mod_T_perm))
##
## Call:
## lm(formula = as.formula(sprintf(" perm_boost_relrank ~ %s",
## predictors)), data = lmdata_T)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67278 -0.19342 0.02505 0.21969 0.60132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.038607 0.022426 1.722 0.085511 .
## mean_before_relrank 0.118846 0.033490 3.549 0.000408 ***
## age_group20 0.111425 0.060390 1.845 0.065372 .
## age_group30 0.106071 0.059405 1.786 0.074526 .
## age_group40 0.060312 0.044669 1.350 0.177307
## age_group50 -0.060151 0.035107 -1.713 0.087008 .
## age_group60 -0.044402 0.028970 -1.533 0.125726
## age_group80 -0.015164 0.027744 -0.547 0.584804
## age_group90 -0.033081 0.034715 -0.953 0.340897
## death_typeunnatural 0.096865 0.033567 2.886 0.004004 **
## genderFemale 0.029224 0.025538 1.144 0.252806
## anglonon_anglo -0.031642 0.026118 -1.212 0.226027
## anglounknown -0.139591 0.030542 -4.570 5.59e-06 ***
## type_groupacademia/engineering 0.076049 0.070050 1.086 0.277952
## type_groupgeneral fame 0.006527 0.043988 0.148 0.882072
## type_groupknown for death -0.025012 0.035532 -0.704 0.481664
## type_groupleadership -0.102206 0.029192 -3.501 0.000487 ***
## type_groupsports -0.025534 0.029704 -0.860 0.390257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2741 on 852 degrees of freedom
## Multiple R-squared: 0.1189, Adjusted R-squared: 0.1013
## F-statistic: 6.765 on 17 and 852 DF, p-value: 2.014e-15
Table 6: Linear regression model of long-term boost on Twitter.
# Refit the regressions with an age-by-manner-of-death interaction, per
# source (helper defined elsewhere in this file; used for Figure 11).
mods_N <- run_regression_analysis_for_death_types('NEWS')
mods_T <- run_regression_analysis_for_death_types('TWITTER')
Figure 11: Dependence of post-mortem mention frequency on age at death for news (top) and Twitter (bottom).
\[\\[4mm]\]
# Pull the fitted interaction models out of the per-source result lists.
mod_peak_N <- mods_N[["mod_peak"]]
mod_perm_N <- mods_N[["mod_perm"]]
mod_peak_T <- mods_T[["mod_peak"]]
mod_perm_T <- mods_T[["mod_perm"]]
# Full summary of the news short-term-boost interaction model (Table 7).
print(summary(mod_peak_N))
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost_relrank ~ %s",
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62147 -0.19550 0.01088 0.20087 0.73057
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank 0.2876073 0.0310288 9.269 < 2e-16 ***
## age_group70 -0.0010522 0.0211400 -0.050 0.96031
## age_group20 0.1238746 0.0881930 1.405 0.16051
## age_group30 0.0812147 0.0791111 1.027 0.30491
## age_group40 -0.0379687 0.0449768 -0.844 0.39881
## age_group50 -0.0170764 0.0323044 -0.529 0.59722
## age_group60 -0.0432214 0.0229844 -1.880 0.06039 .
## age_group80 0.0149510 0.0207886 0.719 0.47222
## age_group90 0.0749740 0.0279819 2.679 0.00752 **
## death_typeunnatural 0.3175289 0.0993996 3.194 0.00145 **
## genderFemale 0.0340646 0.0239914 1.420 0.15602
## anglonon_anglo -0.1047908 0.0247235 -4.239 2.5e-05 ***
## anglounknown -0.1532241 0.0284546 -5.385 9.4e-08 ***
## type_groupacademia/engineering 0.0530173 0.0654330 0.810 0.41802
## type_groupgeneral fame 0.0191595 0.0412557 0.464 0.64247
## type_groupknown for death -0.0403775 0.0330598 -1.221 0.22229
## type_groupleadership 0.0299099 0.0275100 1.087 0.27724
## type_groupsports -0.0005107 0.0277366 -0.018 0.98531
## age_group20:death_typeunnatural -0.1915827 0.1440851 -1.330 0.18399
## age_group30:death_typeunnatural -0.0252587 0.1410809 -0.179 0.85795
## age_group40:death_typeunnatural -0.0343547 0.1252681 -0.274 0.78396
## age_group50:death_typeunnatural -0.1337441 0.1183163 -1.130 0.25863
## age_group60:death_typeunnatural -0.0985255 0.1169552 -0.842 0.39979
## age_group80:death_typeunnatural -0.1499235 0.1799365 -0.833 0.40497
## age_group90:death_typeunnatural -0.3572365 0.2769449 -1.290 0.19743
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2559 on 845 degrees of freedom
## Multiple R-squared: 0.2383, Adjusted R-squared: 0.2157
## F-statistic: 10.57 on 25 and 845 DF, p-value: < 2.2e-16
Table 7: Linear regression model of short-term boost in the news, with added “age by manner of death” interaction.
\[\\[4mm]\]
print(summary(mod_perm_N))
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost_relrank ~ %s", predictors)),
## data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63192 -0.21951 0.00156 0.23031 0.57993
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank -0.039227 0.033372 -1.175 0.240153
## age_group70 0.070331 0.022736 3.093 0.002045 **
## age_group20 0.096091 0.094853 1.013 0.311325
## age_group30 0.041532 0.085085 0.488 0.625588
## age_group40 0.014224 0.048373 0.294 0.768798
## age_group50 -0.025681 0.034744 -0.739 0.460022
## age_group60 -0.022031 0.024720 -0.891 0.373062
## age_group80 0.062061 0.022358 2.776 0.005630 **
## age_group90 0.112468 0.030095 3.737 0.000199 ***
## death_typeunnatural 0.147611 0.106906 1.381 0.167721
## genderFemale 0.038463 0.025803 1.491 0.136437
## anglonon_anglo -0.113644 0.026591 -4.274 2.14e-05 ***
## anglounknown -0.149934 0.030603 -4.899 1.15e-06 ***
## type_groupacademia/engineering -0.019377 0.070374 -0.275 0.783122
## type_groupgeneral fame -0.008856 0.044371 -0.200 0.841847
## type_groupknown for death -0.064415 0.035556 -1.812 0.070396 .
## type_groupleadership -0.070072 0.029587 -2.368 0.018094 *
## type_groupsports -0.044702 0.029831 -1.498 0.134378
## age_group20:death_typeunnatural -0.027778 0.154966 -0.179 0.857782
## age_group30:death_typeunnatural -0.004357 0.151735 -0.029 0.977101
## age_group40:death_typeunnatural 0.026451 0.134728 0.196 0.844400
## age_group50:death_typeunnatural -0.098592 0.127251 -0.775 0.438686
## age_group60:death_typeunnatural 0.059818 0.125787 0.476 0.634521
## age_group80:death_typeunnatural -0.009561 0.193525 -0.049 0.960608
## age_group90:death_typeunnatural -0.288201 0.297859 -0.968 0.333534
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2753 on 845 degrees of freedom
## Multiple R-squared: 0.1189, Adjusted R-squared: 0.09283
## F-statistic: 4.561 on 25 and 845 DF, p-value: 2.333e-12
Table 8: Linear regression model of long-term boost in the news, with added “age by manner of death” interaction.
\[\\[4mm]\]
print(summary(mod_peak_T))
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost_relrank ~ %s",
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71524 -0.18611 -0.00282 0.20651 0.61822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank 0.334205 0.031828 10.500 < 2e-16 ***
## age_group70 -0.033041 0.021488 -1.538 0.124511
## age_group20 0.237288 0.089330 2.656 0.008049 **
## age_group30 0.175763 0.080115 2.194 0.028516 *
## age_group40 0.035998 0.045584 0.790 0.429916
## age_group50 0.020332 0.032761 0.621 0.535032
## age_group60 0.005842 0.023277 0.251 0.801897
## age_group80 -0.026348 0.021223 -1.241 0.214778
## age_group90 -0.025258 0.028355 -0.891 0.373308
## death_typeunnatural 0.108666 0.100663 1.079 0.280675
## genderFemale -0.007697 0.024242 -0.318 0.750933
## anglonon_anglo -0.033544 0.024986 -1.342 0.179797
## anglounknown -0.109642 0.029019 -3.778 0.000169 ***
## type_groupacademia/engineering 0.118763 0.066238 1.793 0.073337 .
## type_groupgeneral fame 0.047796 0.041796 1.144 0.253129
## type_groupknown for death -0.026487 0.033731 -0.785 0.432540
## type_groupleadership 0.026762 0.027648 0.968 0.333343
## type_groupsports 0.023447 0.028200 0.831 0.405954
## age_group20:death_typeunnatural -0.078726 0.145910 -0.540 0.589650
## age_group30:death_typeunnatural 0.001729 0.142781 0.012 0.990343
## age_group40:death_typeunnatural 0.082247 0.126535 0.650 0.515874
## age_group50:death_typeunnatural -0.030965 0.119968 -0.258 0.796386
## age_group60:death_typeunnatural -0.014870 0.118442 -0.126 0.900118
## age_group80:death_typeunnatural -0.166931 0.182080 -0.917 0.359509
## age_group90:death_typeunnatural -0.125260 0.280592 -0.446 0.655414
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2591 on 845 degrees of freedom
## Multiple R-squared: 0.2191, Adjusted R-squared: 0.196
## F-statistic: 9.482 on 25 and 845 DF, p-value: < 2.2e-16
Table 9: Linear regression model of short-term boost on Twitter, with added “age by manner of death” interaction.
\[\\[4mm]\]
print(summary(mod_perm_T))
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost_relrank ~ %s", predictors)),
## data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.70715 -0.19327 0.02603 0.21907 0.59020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank 0.116011 0.033760 3.436 0.000618 ***
## age_group70 0.038062 0.022793 1.670 0.095301 .
## age_group20 0.071226 0.094752 0.752 0.452437
## age_group30 0.179281 0.084978 2.110 0.035174 *
## age_group40 0.097324 0.048351 2.013 0.044444 *
## age_group50 -0.009159 0.034750 -0.264 0.792173
## age_group60 -0.008024 0.024690 -0.325 0.745276
## age_group80 0.023841 0.022511 1.059 0.289860
## age_group90 0.007153 0.030076 0.238 0.812081
## death_typeunnatural 0.122294 0.106773 1.145 0.252381
## genderFemale 0.029356 0.025713 1.142 0.253903
## anglonon_anglo -0.034279 0.026503 -1.293 0.196225
## anglounknown -0.141044 0.030780 -4.582 5.29e-06 ***
## type_groupacademia/engineering 0.076314 0.070259 1.086 0.277705
## type_groupgeneral fame 0.003185 0.044332 0.072 0.942744
## type_groupknown for death -0.024310 0.035778 -0.679 0.497031
## type_groupleadership -0.101816 0.029326 -3.472 0.000543 ***
## type_groupsports -0.023928 0.029912 -0.800 0.423974
## age_group20:death_typeunnatural 0.090805 0.154766 0.587 0.557546
## age_group30:death_typeunnatural -0.082225 0.151447 -0.543 0.587321
## age_group40:death_typeunnatural -0.019321 0.134215 -0.144 0.885570
## age_group50:death_typeunnatural -0.079483 0.127249 -0.625 0.532389
## age_group60:death_typeunnatural -0.001916 0.125630 -0.015 0.987838
## age_group80:death_typeunnatural -0.006991 0.193131 -0.036 0.971132
## age_group90:death_typeunnatural -0.144877 0.297622 -0.487 0.626539
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2749 on 845 degrees of freedom
## Multiple R-squared: 0.1214, Adjusted R-squared: 0.0954
## F-statistic: 4.67 on 25 and 845 DF, p-value: 8.867e-13
Table 10: Linear regression model of long-term boost on Twitter, with added “age by manner of death” interaction.
# Build a per-person dataset of news-vs.-Twitter differences in the
# rank-normalized characteristics. (The element-wise subtractions assume
# lmdata_N and lmdata_T are row-aligned, i.e., same people in same order.)
lmdata <- lmdata_N
lmdata[["mean_before_relrank_N"]] <- lmdata_N[["mean_before_relrank"]]
lmdata[["mean_before_relrank_T"]] <- lmdata_T[["mean_before_relrank"]]
lmdata[["mean_before_relrank_diff"]] <- lmdata[["mean_before_relrank_N"]] - lmdata[["mean_before_relrank_T"]]
lmdata[["peak_mean_boost_relrank_N"]] <- lmdata_N[["peak_mean_boost_relrank"]]
lmdata[["peak_mean_boost_relrank_T"]] <- lmdata_T[["peak_mean_boost_relrank"]]
lmdata[["peak_mean_boost_relrank_diff"]] <- lmdata[["peak_mean_boost_relrank_N"]] - lmdata[["peak_mean_boost_relrank_T"]]
lmdata[["perm_boost_relrank_N"]] <- lmdata_N[["perm_boost_relrank"]]
lmdata[["perm_boost_relrank_T"]] <- lmdata_T[["perm_boost_relrank"]]
lmdata[["perm_boost_relrank_diff"]] <- lmdata[["perm_boost_relrank_N"]] - lmdata[["perm_boost_relrank_T"]]
# Regress the differences on the same covariates as before; the pre-mortem
# predictor is itself now a news-minus-Twitter difference.
predictors <- 'mean_before_relrank_diff + age_group + death_type + gender + anglo + type_group'
mod_peak <- lm(as.formula(sprintf('peak_mean_boost_relrank_diff ~ %s', predictors)), lmdata)
mod_perm <- lm(as.formula(sprintf('perm_boost_relrank_diff ~ %s', predictors)), lmdata)
# Coefficient display names; [-6] drops the reference decade 70--79.
age_names <- sprintf("Age: %d--%d", seq(20,90,10)[-6], seq(29,99,10)[-6])
type_names <- levels(lmdata_N$type_group)[-1]
# Render both difference models as one LaTeX table and write it to TABLEDIR.
table <- texreg(list(mod_peak, mod_perm), dcolumn=TRUE, booktabs=TRUE,
use.packages=FALSE, digits=3, leading.zero=FALSE, single.row=TRUE, stars = c(0.001,0.01,0.05),
label = "table:coefficients News vs. Twitter", custom.model.names=c("Short-term boost", "Long-term boost"),
custom.coef.names=c("(Intercept)", "Pre-mortem mean (relative-rank diff.)", age_names,
"Manner of death: unnatural",
"Gender: female", paste("Cultural background:", c("non-anglophone", "unknown")),
paste("Notability type:", type_names)),
reorder.coef=c(1,2,10,12:13,11,14:18,3:9), table=FALSE)
cat(table, file=sprintf('%s/lm_coef_T_vs_N.tex', TABLEDIR), sep="\n")
# Full summary of the short-term-boost difference model (Table 11).
print(summary(mod_peak))
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost_relrank_diff ~ %s",
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6496 -0.1208 0.0038 0.1106 0.7404
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.007131 0.016294 0.438 0.661761
## mean_before_relrank_diff -0.077831 0.029007 -2.683 0.007434 **
## age_group20 -0.177328 0.043731 -4.055 5.47e-05 ***
## age_group30 -0.091639 0.043024 -2.130 0.033461 *
## age_group40 -0.112473 0.032340 -3.478 0.000531 ***
## age_group50 -0.058774 0.025457 -2.309 0.021195 *
## age_group60 -0.061181 0.021051 -2.906 0.003753 **
## age_group80 0.012762 0.020081 0.636 0.525260
## age_group90 0.072718 0.025139 2.893 0.003917 **
## death_typeunnatural 0.132598 0.024312 5.454 6.46e-08 ***
## genderFemale 0.030131 0.018500 1.629 0.103744
## anglonon_anglo -0.077795 0.018896 -4.117 4.21e-05 ***
## anglounknown -0.017957 0.021921 -0.819 0.412906
## type_groupacademia/engineering -0.026667 0.050811 -0.525 0.599843
## type_groupgeneral fame -0.011339 0.031869 -0.356 0.722071
## type_groupknown for death 0.028830 0.025684 1.123 0.261965
## type_groupleadership 0.060712 0.021578 2.814 0.005011 **
## type_groupsports 0.004572 0.021517 0.212 0.831794
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1985 on 852 degrees of freedom
## Multiple R-squared: 0.1042, Adjusted R-squared: 0.08629
## F-statistic: 5.827 on 17 and 852 DF, p-value: 9.067e-13
Table 11: Linear regression model of news-vs.-Twitter difference in short-term boost.
\[\\[4mm]\]
print(summary(mod_perm))
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost_relrank_diff ~ %s",
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92186 -0.12959 0.01391 0.14260 0.95349
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.008043 0.022742 0.354 0.72367
## mean_before_relrank_diff -0.230628 0.040485 -5.697 1.68e-08 ***
## age_group20 -0.105519 0.061035 -1.729 0.08420 .
## age_group30 -0.128012 0.060048 -2.132 0.03331 *
## age_group40 -0.106048 0.045137 -2.349 0.01903 *
## age_group50 -0.043230 0.035530 -1.217 0.22405
## age_group60 -0.026262 0.029381 -0.894 0.37165
## age_group80 0.004226 0.028027 0.151 0.88019
## age_group90 0.080720 0.035086 2.301 0.02165 *
## death_typeunnatural 0.039642 0.033933 1.168 0.24303
## genderFemale 0.007763 0.025820 0.301 0.76373
## anglonon_anglo -0.074869 0.026373 -2.839 0.00464 **
## anglounknown 0.023978 0.030594 0.784 0.43341
## type_groupacademia/engineering -0.069726 0.070916 -0.983 0.32578
## type_groupgeneral fame 0.003712 0.044479 0.083 0.93352
## type_groupknown for death 0.001229 0.035846 0.034 0.97266
## type_groupleadership 0.066667 0.030116 2.214 0.02711 *
## type_groupsports 0.008463 0.030031 0.282 0.77816
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2771 on 852 degrees of freedom
## Multiple R-squared: 0.0743, Adjusted R-squared: 0.05583
## F-statistic: 4.023 on 17 and 852 DF, p-value: 9.057e-08
Table 12: Linear regression model of news-vs.-Twitter difference in long-term boost.
# Difference models with an age-by-manner-of-death interaction; '0 +'
# suppresses the intercept so every age group gets its own coefficient
# (note age_group70 appears in the summaries below).
predictors <- '0 + mean_before_relrank_diff + age_group + death_type + gender + anglo + type_group + age_group:death_type'
mod_peak <- lm(as.formula(sprintf('peak_mean_boost_relrank_diff ~ %s', predictors)), lmdata)
mod_perm <- lm(as.formula(sprintf('perm_boost_relrank_diff ~ %s', predictors)), lmdata)
# Plot the two age-coefficient profiles side by side (Figure 12).
ylim <- c(-0.45, 0.45)
par(mfrow=c(1,2))
plot_age_coeffs_for_death_types(mod_peak, NA, 'peak_mean_boost_diff', 70, ylim, 'black')
plot_age_coeffs_for_death_types(mod_perm, NA, 'perm_boost_diff', 70, ylim, 'black')
Figure 12: Dependence of news-vs.-Twitter boost difference (for fixed person) on age at death.
\[\\[4mm]\]
# Full coefficient table for the short-term (peak) boost-difference model.
# The '##' lines below are the frozen knitr output of this call.
print(summary(mod_peak))
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost_relrank_diff ~ %s",
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64846 -0.11978 0.00327 0.10887 0.74189
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank_diff -0.079660 0.029192 -2.729 0.006487 **
## age_group70 0.005902 0.016535 0.357 0.721241
## age_group20 -0.131089 0.068543 -1.913 0.056148 .
## age_group30 -0.142074 0.061578 -2.307 0.021283 *
## age_group40 -0.100671 0.035010 -2.876 0.004135 **
## age_group50 -0.052507 0.025113 -2.091 0.036838 *
## age_group60 -0.051830 0.017834 -2.906 0.003754 **
## age_group80 0.018036 0.016191 1.114 0.265611
## age_group90 0.080562 0.021794 3.696 0.000233 ***
## death_typeunnatural 0.155420 0.077349 2.009 0.044819 *
## genderFemale 0.032534 0.018614 1.748 0.080860 .
## anglonon_anglo -0.076134 0.019147 -3.976 7.6e-05 ***
## anglounknown -0.018445 0.022060 -0.836 0.403316
## type_groupacademia/engineering -0.026851 0.050923 -0.527 0.598128
## type_groupgeneral fame -0.010738 0.032091 -0.335 0.738007
## type_groupknown for death 0.027646 0.025829 1.070 0.284766
## type_groupleadership 0.060264 0.021657 2.783 0.005512 **
## type_groupsports 0.004710 0.021646 0.218 0.827786
## age_group20:death_typeunnatural -0.081471 0.112014 -0.727 0.467224
## age_group30:death_typeunnatural 0.071281 0.109837 0.649 0.516537
## age_group40:death_typeunnatural -0.038913 0.097332 -0.400 0.689404
## age_group50:death_typeunnatural -0.020927 0.092143 -0.227 0.820392
## age_group60:death_typeunnatural -0.046500 0.090943 -0.511 0.609270
## age_group80:death_typeunnatural 0.071107 0.139837 0.508 0.611238
## age_group90:death_typeunnatural -0.166528 0.215139 -0.774 0.439120
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1989 on 845 degrees of freedom
## Multiple R-squared: 0.1079, Adjusted R-squared: 0.08154
## F-statistic: 4.089 on 25 and 845 DF, p-value: 1.466e-10
Table 13: Linear regression model of news-vs.-Twitter difference in short-term boost, with added “age by manner of death” interaction.
$$\\[4mm]$$
# Full coefficient table for the long-term (permanent) boost-difference model.
# The '##' lines below are the frozen knitr output of this call.
print(summary(mod_perm))
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost_relrank_diff ~ %s",
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92420 -0.13055 0.01538 0.14045 0.95719
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank_diff -0.2341465 0.0407524 -5.746 1.28e-08 ***
## age_group70 0.0106514 0.0230835 0.461 0.64461
## age_group20 0.0031530 0.0956873 0.033 0.97372
## age_group30 -0.1718587 0.0859643 -1.999 0.04591 *
## age_group40 -0.1054249 0.0488746 -2.157 0.03128 *
## age_group50 -0.0333427 0.0350581 -0.951 0.34184
## age_group60 -0.0213921 0.0248968 -0.859 0.39046
## age_group80 0.0133539 0.0226032 0.591 0.55481
## age_group90 0.0899990 0.0304251 2.958 0.00318 **
## death_typeunnatural -0.0135298 0.1079812 -0.125 0.90032
## genderFemale 0.0080192 0.0259857 0.309 0.75770
## anglonon_anglo -0.0751648 0.0267296 -2.812 0.00504 **
## anglounknown 0.0227608 0.0307967 0.739 0.46007
## type_groupacademia/engineering -0.0697905 0.0710894 -0.982 0.32651
## type_groupgeneral fame 0.0031289 0.0448002 0.070 0.94434
## type_groupknown for death -0.0002293 0.0360578 -0.006 0.99493
## type_groupleadership 0.0668489 0.0302337 2.211 0.02730 *
## type_groupsports 0.0050713 0.0302191 0.168 0.86677
## age_group20:death_typeunnatural -0.0923192 0.1563740 -0.590 0.55510
## age_group30:death_typeunnatural 0.1416421 0.1533360 0.924 0.35589
## age_group40:death_typeunnatural 0.0782650 0.1358777 0.576 0.56477
## age_group50:death_typeunnatural 0.0504094 0.1286343 0.392 0.69524
## age_group60:death_typeunnatural 0.0908410 0.1269589 0.716 0.47449
## age_group80:death_typeunnatural 0.0187045 0.1952165 0.096 0.92369
## age_group90:death_typeunnatural -0.0496572 0.3003405 -0.165 0.86872
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2777 on 845 degrees of freedom
## Multiple R-squared: 0.07775, Adjusted R-squared: 0.05046
## F-statistic: 2.849 on 25 and 845 DF, p-value: 4.86e-06
Table 14: Linear regression model of news-vs.-Twitter difference in long-term boost, with added “age by manner of death” interaction.